text_indexing 0.14.3+1 copy "text_indexing: ^0.14.3+1" to clipboard
text_indexing: ^0.14.3+1 copied to clipboard

outdated

Dart library for creating an inverted index on a collection of text documents.

example/text_indexing_example.dart

// Copyright ©2022, GM Consult (Pty) Ltd
// BSD 3-Clause License
// All rights reserved

// ignore_for_file: deprecated_member_use_from_same_package

import 'package:text_indexing/text_indexing.dart';

/// Runs the two indexing examples in this file, one after the other:
/// - [_inMemoryIndexerExample] indexes the [textData] dataset with a
///   [TextIndexer] backed by an [InMemoryIndex]; and
/// - [_asyncIndexerExample] indexes the [jsonData] dataset with a
///   [TextIndexer] backed by an [AsyncCallbackIndex].
///
/// Both examples print index statistics for the terms of the same search
/// phrase. Returns a [Future] so that callers can await completion and observe
/// any error thrown by the examples.
Future<void> main() async {
  // The phrase that each example tokenizes into the terms it reports on.
  const searchPhrase = 'stock market tesla EV battery';

  // Zone (field) names mapped to a numeric value, passed as `zones` to both
  // indexes; presumably a relevance weight per zone - confirm against the
  // library documentation.
  // NOTE(review): the records in [jsonData] use the key 'hashTags' (plural);
  // confirm whether the 'hashTag' zone below is intended to match that field.
  const zones = {
    'name': 1.0,
    'description': 0.5,
    'hashTag': 2.0,
    'publicationDate': 0.1
  };

  // Run the in-memory example on the [textData] dataset.
  await _inMemoryIndexerExample(textData, searchPhrase, zones);

  // Run the asynchronous example on the [jsonData] dataset.
  await _asyncIndexerExample(jsonData, searchPhrase, zones);
}

/// Demonstrates a [TextIndexer] backed by an [InMemoryIndex] on [documents]:
/// - create empty [Dictionary], [Postings] and [KGramIndex] stores;
/// - build an [InMemoryIndex] over those stores and wrap it in a [TextIndexer];
/// - tokenize [searchPhrase] with the index's tokenizer;
/// - index every entry of [documents] in turn, passing the entry's key and
///   value to [TextIndexer.indexText]; and
/// - print the index statistics for the tokens of [searchPhrase].
Future<void> _inMemoryIndexerExample(Map<String, String> documents,
    String searchPhrase, Map<String, double> zones) async {
  // The three in-memory stores handed to the index.
  final Dictionary termDictionary = {};
  final Postings termPostings = {};
  final KGramIndex termKGrams = {};

  // Build the in-memory index over those stores and wrap it in a TextIndexer.
  final indexer = TextIndexer(
      index: InMemoryIndex(
          tokenizer: TextTokenizer(),
          dictionary: termDictionary,
          postings: termPostings,
          kGramIndex: termKGrams,
          zones: zones,
          phraseLength: 2,
          k: 3));

  // Tokenize the search phrase with the index's own tokenizer.
  final phraseTokens = await indexer.index.tokenizer.tokenize(searchPhrase);

  // Index the documents one at a time, waiting for each to finish before
  // starting the next.
  for (final entry in documents.entries) {
    await indexer.indexText(entry.key, entry.value);
  }

  // Report the statistics for each term of the search phrase.
  await _printTermStats(indexer.index, phraseTokens);
}

/// Demonstrates a [TextIndexer] backed by an [AsyncCallbackIndex] on the JSON
/// [documents], using a [_TestIndexRepository] as a simulated persisted index
/// repository with latency on its read/write operations:
/// - initialize the [_TestIndexRepository];
/// - initialize an [AsyncCallbackIndex] whose callbacks read from and write to
///   the repository, and wrap it in a [TextIndexer];
/// - tokenize [searchPhrase] with the index's tokenizer;
/// - index the [documents] collection;
/// - print the number of terms and documents that were indexed; and
/// - print the index statistics for the tokens of [searchPhrase].
Future<void> _asyncIndexerExample(Map<String, JSON> documents,
    String searchPhrase, Map<String, double> zones) async {
  // - initialize a [_TestIndexRepository]
  final repository = _TestIndexRepository();

  // - initialize an index that delegates all storage to the repository
  final index = AsyncCallbackIndex(
      dictionaryLoader: repository.getDictionary,
      dictionaryUpdater: repository.upsertDictionary,
      dictionaryLengthLoader: () => repository.vocabularyLength,
      kGramIndexLoader: repository.getKGramIndex,
      kGramIndexUpdater: repository.upsertKGramIndex,
      postingsLoader: repository.getPostings,
      postingsUpdater: repository.upsertPostings,
      zones: zones,
      k: 3,
      phraseLength: 2,
      tokenizer: TextTokenizer());

  // - tokenize a phrase into search terms
  final searchTerms = await index.tokenizer.tokenize(searchPhrase);

  // - initialize a [TextIndexer] with the index
  final indexer = TextIndexer(index: index);

  // - index the collection that was passed in, rather than a global dataset
  await indexer.indexCollection(documents);

  // Short pause retained from the original example; presumably it lets any
  // pending asynchronous index updates settle - confirm whether it is still
  // needed.
  await Future.delayed(const Duration(milliseconds: 250));

  // Report the counts only after indexing, so they reflect the populated
  // repository instead of always being zero.
  final termCount = await repository.vocabularyLength;
  print('Indexed $termCount terms from ${documents.length} documents.');

  // print the statistics for each term in [searchTerms].
  await _printTermStats(index, searchTerms);
}

/// Queries [index] for the terms in [tokens] and prints their statistics.
///
/// The terms of [tokens] are first expanded with every term that [index] maps
/// to one of the k-grams (k = 3) of [tokens]. For each resulting term a table
/// row with its term frequency, document frequency and inverse document
/// frequency is printed, followed by a summary of how many entries were
/// retrieved and how many milliseconds the index queries took.
Future<void> _printTermStats(
    InvertedIndex index, Iterable<Token> tokens) async {
  // An 80 character horizontal rule used to frame the printed output.
  final rule = '-' * 80;

  // The distinct terms and the k-grams of the tokens.
  final terms = tokens.terms.toSet();
  final tokenKGrams = tokens.kGrams(3);

  // Time stamp, in milliseconds, taken just before the index is queried.
  final startedAt = DateTime.now().millisecondsSinceEpoch;

  // Look up the k-grams and collect every term they map to.
  final kGramLookup = await index.getKGramIndex(tokenKGrams.keys);
  final expandedTerms = <String>{
    for (final mapped in kGramLookup.values) ...mapped
  };

  // Report on the k-gram terms as well as the original terms.
  terms.addAll(expandedTerms);

  // Query the index for the three statistics of every term.
  final idfByTerm = await index.getIdFtIndex(terms);
  final tfByTerm = await index.getTfIndex(terms);
  final dfByTerm = await index.getDictionary(terms);

  // Time stamp, in milliseconds, taken just after the last query returned.
  final finishedAt = DateTime.now().millisecondsSinceEpoch;
  final elapsed = (finishedAt - startedAt).toStringAsFixed(3);

  // Table title and column headings.
  final headings = StringBuffer()
    ..write('Term'.padRight(10))
    ..write('Term Frequency'.padLeft(20))
    ..write('Document Frequency'.padLeft(20))
    ..write('Inverse Document Frequency'.padLeft(30));
  print('_' * 80);
  print('DICTIONARY STATISTICS (search terms)');
  print(rule);
  print(headings);
  print(rule);

  // One row per term; a statistic missing from the index is shown as zero.
  for (final term in terms) {
    final tf = tfByTerm[term] ?? 0;
    final df = dfByTerm[term] ?? 0;
    final idf = idfByTerm[term] ?? 0.0;
    final idfText = idf.toStringAsFixed(2).padLeft(4, '0');
    print('${term.padRight(10)}'
        '${'$tf'.padLeft(20)}'
        '${'$df'.padLeft(20)}'
        '${idfText.padLeft(30)}');
  }

  // Close the table with a double rule.
  print(rule);
  print(rule);

  // Summary of what was retrieved and how long the queries took.
  print('Retrieved');
  print('- dictionary for ${dfByTerm.length} terms;');
  print('- term frequencies for ${tfByTerm.length} terms;');
  print('- inverse document frequencies for ${idfByTerm.length} terms; and');
  print('- k-gram postings for ${expandedTerms.length} terms.');
  print('in $elapsed milliseconds.');
  print(rule);
  print(rule);
}

/// Sample text documents for the in-memory indexing example.
///
/// Maps four document ids (`doc000` to `doc003`) to a paragraph of text each.
/// The paragraphs include numbers, currencies, abbreviations, hyphens and
/// identifiers.
final Map<String, String> textData = {
  'doc000': 'The Dow Jones rallied even as U.S. troops were put on alert amid '
      'the Ukraine crisis. Tesla stock fought back while Apple '
      'stock struggled. ',
  'doc001': '[TSLA.XNGS] Tesla\'s #TeslaMotor Stock Is Getting Hammered.',
  'doc002': 'Among the best EV stocks to buy and watch, Tesla '
      '(TSLA.XNGS) is pulling back from new highs after a failed breakout '
      'above a \$1,201.05 double-bottom entry. ',
  'doc003': 'Meanwhile, Peloton reportedly finds an activist investor knocking '
      'on its door after a major stock crash fueled by strong indications of '
      'mismanagement. In a scathing new letter released Monday, activist '
      'Tesla Capital is pushing for Peloton to fire CEO, Chairman and '
      'founder John Foley and explore a sale.'
};

/// Sample JSON documents used to demonstrate persisted indexing of fields in
/// JSON documents.
///
/// A map of eight news-item records, each keyed by the same value as the
/// record's own `id` field. Every record holds string fields such as `name`,
/// `description`, `linkUrl` and `publicationDate`, a `hashTags` list, a nested
/// `publisher` map and an integer `timestamp`. The record with id
/// `f2fa8eea-6259-5c83-8865-e0b7f80e691d` has no `entityType` field.
final jsonData = {
  'ee1760a1-a259-50dc-b11d-8baf34d7d1c5': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FTSLA%3AXNGS.png?alt=media&token=c365db47-9482-4237-9267-82f72854d161',
    'description':
        'A 20-for-1 stock split gave a nice short-term boost to Amazon (AMZN) - Get Amazon.com Inc. Report in late May and in early June, while Alphabet (GOOGL) - Get Alphabet Inc. Report (GOOG) - Get Alphabet Inc. Report has a planned 20-for-1 stock split for next month. Tesla  (TSLA) - Get Tesla Inc. Report is also waiting on shareholder approval for a 3-for-1 stock split. ',
    'entityType': 'NewsItem',
    'hashTags': ['#Tesla'],
    'id': 'ee1760a1-a259-50dc-b11d-8baf34d7d1c5',
    'itemGuid':
        'trading-shopify-stock-ahead-of-10-for-1-stock-split-technical-analysis-june-2022?puc=yahoo&cm_ven=YAHOO&yptr=yahoo',
    'linkUrl':
        'https://www.thestreet.com/investing/trading-shopify-stock-ahead-of-10-for-1-stock-split-technical-analysis-june-2022?puc=yahoo&cm_ven=YAHOO&yptr=yahoo',
    'locale': 'Locale.en_US',
    'name': 'Shopify Stock Split What the Charts Say Ahead of 10-for-1 Split',
    'publicationDate': '2022-06-28T17:44:00.000Z',
    'publisher': {
      'linkUrl': 'http://www.thestreet.com/',
      'title': 'TheStreet com'
    },
    'timestamp': 1656464362162
  },
  'ee1d9610-b902-53e6-8264-840bd403365b': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FO%3AXNYS.png?alt=media&token=15b5e8fe-bec2-4711-b0f7-e5a631287e9e',
    'description': 'OR',
    'entityType': 'NewsItem',
    'hashTags': ['#RealtyIncome'],
    'id': 'ee1d9610-b902-53e6-8264-840bd403365b',
    'itemGuid': 'auddev&yptr=yahoo',
    'linkUrl':
        'https://www.ft.com/cms/s/0ea7bcc1-d3a6-4897-8da3-98798c3be487,s01=1.html?ftcamp=traffic/partner/feed_headline/us_yahoo/auddev&yptr=yahoo',
    'locale': 'Locale.en_US',
    'name': 'History says US stock market has further to fall',
    'publicationDate': '2022-06-25T12:35:45.000Z',
    'publisher': {'linkUrl': 'http://ft.com/', 'title': 'Financial Times'},
    'timestamp': 1656193158270
  },
  'ef3b0cb6-0297-502b-bd77-283246bc0014': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FJPM-C%3AXNYS.png?alt=media&token=deba8c6d-019e-4d49-9faa-d641a3cf1986',
    'description':
        'JPMorgan Sees ‘Stratospheric’ \$380 Oil on Worst-Case Russian Cut',
    'entityType': 'NewsItem',
    'hashTags': ['#JPMorganChase'],
    'id': 'ef3b0cb6-0297-502b-bd77-283246bc0014',
    'itemGuid': 'germany-risks-cascade-utility-failures-194853753.html',
    'linkUrl':
        'https://finance.yahoo.com/news/germany-risks-cascade-utility-failures-194853753.html',
    'locale': 'Locale.en_US',
    'name': 'Germany Risks a Cascade of Utility Failures Economy Chief Says',
    'publicationDate': '2022-07-02T19:48:53.000Z',
    'publisher': {
      'linkUrl': 'https://www.bloomberg.com/',
      'title': 'Bloomberg'
    },
    'timestamp': 1656802194091
  },
  'f1064cca-bf6d-5689-900b-ecb8769fe30b': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FINTC%3AXNGS.png?alt=media&token=cfefaa0a-7f06-42f8-a316-954ded1fd703',
    'description':
        'Under CEO Pat Gelsinger, Intel has committed to massively increasing its capital spending investments by tens of billions of dollars. But with a rapidly slowing global economy, repeated product delays, rising competitive threats, and political uncertainty, it might need more help to fund its ambitions. One good place to find it would be Intel (ticker: INTC) generous dividend payout. While the chip maker has paid a consistent dividend for three decades straight, it needs to do whatever it takes to shore up its future—up to and including cutting its dividend. Under CEO Pat Gelsinger, Intel has committed to massively increasing its capital spending investments by tens of billions of dollars.',
    'entityType': 'NewsItem',
    'hashTags': ['#Intel'],
    'id': 'f1064cca-bf6d-5689-900b-ecb8769fe30b',
    'itemGuid':
        'intel-stock-dividend-future-51656450364?siteid=yhoof2&yptr=yahoo',
    'linkUrl':
        'https://www.barrons.com/articles/intel-stock-dividend-future-51656450364?siteid=yhoof2&yptr=yahoo',
    'locale': 'Locale.en_US',
    'name':
        'Intel Should Slash Its Dividend The Chip Maker s Future May Depend on It',
    'publicationDate': '2022-06-29T12:00:00.000Z',
    'publisher': {'linkUrl': 'http://www.barrons.com/', 'title': 'Barrons com'},
    'timestamp': 1656540522708
  },
  'f2fa8eea-6259-5c83-8865-e0b7f80e691d': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FINTC%3AXNGS.png?alt=media&token=cfefaa0a-7f06-42f8-a316-954ded1fd703',
    'description':
        'Consumers are being hit by the run-up in gasoline, diesel and other oil products, Mike Muller, head of Asia at Vitol Group, said Sunday on a podcast produced by Dubai-based Gulf Intelligence.',
    'hashTags': ['#Intel', '#JPMorganChase'],
    'id': 'f2fa8eea-6259-5c83-8865-e0b7f80e691d',
    'itemGuid': 'surging-fuel-costs-causing-demand-100930872.html',
    'linkUrl':
        'https://finance.yahoo.com/news/surging-fuel-costs-causing-demand-100930872.html',
    'locale': 'Locale.en_US',
    'name': 'Surging Fuel Costs Are Causing Demand Destruction Says Vitol',
    'publicationDate': '2022-07-03T10:09:30.000Z',
    'publisher': {
      'linkUrl': 'https://www.bloomberg.com/',
      'title': 'Bloomberg'
    },
    'timestamp': 1656881075901
  },
  'f6ee5edf-094f-5892-9b37-71b8f2e90d03': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FAAPL%3AXNGS.png?alt=media&token=fb44cde6-4552-42e7-b1b0-7eddc92b1dfc',
    'description':
        'The Dow Jones Industrial Average rallied out of the red after the latest Fed minutes were released. EV stock Rivian (RIVN) soared on guidance even as Tesla (TSLA) fell. Microsoft (MSFT) and Apple (AAPL) were among the top blue chips.',
    'entityType': 'NewsItem',
    'hashTags': ['#Apple', '#Tesla'],
    'id': 'f6ee5edf-094f-5892-9b37-71b8f2e90d03',
    'itemGuid': '?src=A00220&yptr=yahoo',
    'linkUrl':
        'https://www.investors.com/market-trend/stock-market-today/dow-jones-rallies-as-fed-minutes-reveal-this-jerome-powell-ev-stock-explodes-on-guidance-tesla-stock-apple-stock-pops/?src=A00220&yptr=yahoo',
    'locale': 'Locale.en_US',
    'name':
        'Dow Jones Rallies As Fed Minutes Reveal This EV Stock Explodes On Guidance Apple Stock Vaults',
    'publicationDate': '2022-07-06T19:15:45.000Z',
    'publisher': {
      'linkUrl': 'http://www.investors.com/',
      'title': "Investor's Business Daily"
    },
    'timestamp': 1657158311943
  },
  'f83c0c39-8cc3-5b0e-91de-61e829ea65dc': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FXOM%3AXNYS.png?alt=media&token=c94499b0-5937-47ea-8f89-94d65d3ed065',
    'description':
        'Sell Exxon Mobil and other energy stocks before these headwinds hit prices once again: ',
    'entityType': 'NewsItem',
    'hashTags': ['#ExxonMobil'],
    'id': 'f83c0c39-8cc3-5b0e-91de-61e829ea65dc',
    'itemGuid':
        'sell-exxon-mobil-and-other-energy-stocks-before-these-headwinds-once-again-hit-prices-11656527286?siteid=yhoof2&yptr=yahoo',
    'linkUrl':
        'https://www.marketwatch.com/story/sell-exxon-mobil-and-other-energy-stocks-before-these-headwinds-once-again-hit-prices-11656527286?siteid=yhoof2&yptr=yahoo',
    'locale': 'Locale.en_US',
    'name':
        'Sell Exxon Mobil and other energy stocks before these headwinds hit prices once again',
    'publicationDate': '2022-06-29T18:28:00.000Z',
    'publisher': {
      'linkUrl': 'http://www.marketwatch.com/',
      'title': 'MarketWatch'
    },
    'timestamp': 1656630576652
  },
  'fa2b9c9e-096e-5e27-917a-26badeabff83': {
    'avatarImageUrl':
        'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FTSLA%3AXNGS.png?alt=media&token=c365db47-9482-4237-9267-82f72854d161',
    'description':
        'Tesla Pauses Plants After Ending Shaky Quarter With a Production Milestone',
    'entityType': 'NewsItem',
    'hashTags': ['#Tesla'],
    'id': 'fa2b9c9e-096e-5e27-917a-26badeabff83',
    'itemGuid': 'natural-gas-soars-700-becoming-040106114.html',
    'linkUrl':
        'https://finance.yahoo.com/news/natural-gas-soars-700-becoming-040106114.html',
    'locale': 'Locale.en_US',
    'name': 'Natural Gas Soars 700% Becoming Driving Force in the New Cold War',
    'publicationDate': '2022-07-05T04:01:06.000Z',
    'publisher': {
      'linkUrl': 'https://www.bloomberg.com/',
      'title': 'Bloomberg'
    },
    'timestamp': 1657002625523
  }
};

/// A dummy asynchronous index repository that keeps the term [dictionary],
/// the [postings] and the [kGramIndex] in memory and simulates latency on
/// most of its read/write operations.
///
/// Use for testing and examples.
class _TestIndexRepository {
  /// The [Dictionary] instance that is the data-store for the index's term
  /// dictionary.
  final Dictionary dictionary = {};

  /// The [Postings] instance that is the data-store for the index's postings.
  final Postings postings = {};

  /// The [KGramIndex] instance that is the data-store for the index's k-gram
  /// index.
  final KGramIndex kGramIndex = {};

  /// Completes after a simulated latency of 100 uS per entry for [entries]
  /// entries, rounded down to a whole number of milliseconds.
  Future<void> _simulateLatency(int entries) =>
      Future<void>.delayed(Duration(milliseconds: entries ~/ 10));

  /// Returns a subset of [postings] corresponding to [terms].
  ///
  /// Terms that have no entry in [postings] are omitted from the result.
  /// Simulates latency of 100 uS per term in [terms].
  Future<Postings> getPostings(Iterable<String> terms) async {
    final Postings retVal = {};
    for (final term in terms) {
      final entry = postings[term];
      if (entry != null) {
        retVal[term] = entry;
      }
    }
    await _simulateLatency(terms.length);
    return retVal;
  }

  /// Adds/overwrites the [values] to [dictionary].
  ///
  /// Simulates latency of 100 uS per entry before writing.
  Future<void> upsertDictionary(Dictionary values) async {
    await _simulateLatency(values.length);
    dictionary.addAll(values);
  }

  /// Adds/overwrites the [values] to [postings].
  ///
  /// Simulates latency of 100 uS per entry before writing.
  Future<void> upsertPostings(Postings values) async {
    await _simulateLatency(values.length);
    postings.addAll(values);
  }

  /// Returns a subset of [dictionary] corresponding to [terms].
  ///
  /// If [terms] is null, every entry of [dictionary] is returned. Terms that
  /// have no entry in [dictionary] are omitted from the result.
  /// Simulates latency of 100 uS per term.
  Future<Dictionary> getDictionary([Iterable<String>? terms]) async {
    // Default to the dictionary's own terms; the k-gram index keys are
    // k-grams, not terms, and would match few or no dictionary entries.
    final keys = terms ?? dictionary.keys;
    final Dictionary retVal = {};
    for (final term in keys) {
      final entry = dictionary[term];
      if (entry != null) {
        retVal[term] = entry;
      }
    }
    await _simulateLatency(keys.length);
    return retVal;
  }

  /// Returns a subset of [kGramIndex] corresponding to [kGrams].
  ///
  /// If [kGrams] is null, every entry of [kGramIndex] is returned. K-grams
  /// that have no entry in [kGramIndex] are omitted from the result.
  /// Simulates latency of 100 uS per k-gram.
  Future<KGramIndex> getKGramIndex([Iterable<KGram>? kGrams]) async {
    final keys = kGrams ?? kGramIndex.keys;
    final KGramIndex retVal = {};
    for (final kGram in keys) {
      final entry = kGramIndex[kGram];
      if (entry != null) {
        retVal[kGram] = entry;
      }
    }
    await _simulateLatency(keys.length);
    return retVal;
  }

  /// Adds/overwrites the [values] to [kGramIndex].
  ///
  /// No latency is simulated for this operation.
  Future<void> upsertKGramIndex(KGramIndex values) async =>
      kGramIndex.addAll(values);

  /// Returns the length of the [dictionary].
  ///
  /// Simulates a read latency of 5 milliseconds.
  Future<Ft> get vocabularyLength async {
    await Future.delayed(const Duration(milliseconds: 5));
    return dictionary.length;
  }
}
5
likes
0
pub points
49%
popularity

Publisher

verified publisher: gmconsult.com.au

Dart library for creating an inverted index on a collection of text documents.

Homepage
Repository (GitHub)
View/report issues

License

unknown (license)

Dependencies

collection, meta, rxdart, text_analysis

More

Packages that depend on text_indexing