// Package: text_indexing 0.14.7 (pubspec dependency: `text_indexing: ^0.14.7`).
// Dart library for creating an inverted index on a collection of text documents.
// File: example/text_indexing_example.dart
// Copyright ©2022, GM Consult (Pty) Ltd
// BSD 3-Clause License
// All rights reserved
// ignore_for_file: deprecated_member_use_from_same_package
import 'package:text_indexing/src/_index.dart';
/// Runs the two indexing examples provided with this package:
/// - [_inMemoryIndexerExample] indexes the plain-text [textData] dataset
///   using an in-memory index; and
/// - [_asyncIndexerExample] indexes the [jsonData] dataset of JSON documents
///   using an asynchronous callback index.
///
/// Both examples are given the same search phrase and the same zone map, and
/// are run one after the other.
void main() async {
  // The phrase whose terms are looked up in each index after indexing.
  const searchPhrase = 'stock market tesla EV battery';

  // Zone names mapped to a numeric value per zone.
  //
  // NOTE(review): the zone names are presumably the field names of the
  // documents in [jsonData] that get indexed. The hash-tag field there is
  // keyed `hashTags`, so that spelling is used here; the former `hashTag`
  // matched no field in the sample documents.
  const zones = {
    'name': 1.0,
    'description': 0.5,
    'hashTags': 2.0,
    'publicationDate': 0.1,
  };

  // Run the in-memory example on the plain-text [textData] dataset.
  await _inMemoryIndexerExample(textData, searchPhrase, zones);

  // Run the asynchronous example on the JSON [jsonData] dataset.
  await _asyncIndexerExample(jsonData, searchPhrase, zones);
}
/// Demonstrates indexing plain text with an [InMemoryIndex].
///
/// The example:
/// - creates empty [Dictionary], [Postings] and [KGramIndex] stores;
/// - wraps them in an [InMemoryIndex] that is handed to a [TextIndexer];
/// - tokenizes [searchPhrase] into search terms;
/// - indexes every entry of [documents] in turn, using the entry key as the
///   document id and the entry value as the document text; and
/// - prints the index statistics for the search terms.
///
/// The [zones] map is passed through to the index unchanged.
Future<void> _inMemoryIndexerExample(Map<String, String> documents,
    String searchPhrase, Map<String, double> zones) async {
  // Empty stores that back the in-memory index.
  final Dictionary termDictionary = {};
  final Postings termPostings = {};
  final KGramIndex kGrams = {};

  // The index over the three stores, and the indexer that writes to it.
  final inMemoryIndex = InMemoryIndex(
      tokenizer: TextTokenizer(),
      dictionary: termDictionary,
      postings: termPostings,
      kGramIndex: kGrams,
      zones: zones,
      phraseLength: 2,
      k: 3);
  final indexer = TextIndexer(index: inMemoryIndex);

  // Tokenize the phrase into search terms with the index's own tokenizer.
  final searchTerms = await indexer.index.tokenizer.tokenize(searchPhrase);

  // Index the documents strictly one after the other.
  for (final document in documents.entries) {
    await indexer.indexText(document.key, document.value);
  }

  // Print the statistics for each of the search terms.
  await _printTermStats(indexer.index, searchTerms);
}
/// Demonstrates indexing a collection of JSON documents with an
/// [AsyncCallbackIndex] backed by a [_TestIndexRepository], which stands in
/// for a persisted index repository with simulated read/write latency.
///
/// The example:
/// - creates a [_TestIndexRepository] and wires its loader/updater methods
///   into an [AsyncCallbackIndex];
/// - tokenizes [searchPhrase] into search terms;
/// - indexes [documents] with a [TextIndexer];
/// - prints how many terms were indexed from how many documents; and
/// - prints the index statistics for the search terms.
///
/// The [zones] map is passed through to the index unchanged.
Future<void> _asyncIndexerExample(Map<String, Map<String, dynamic>> documents,
    String searchPhrase, Map<String, double> zones) async {
  // The repository that holds the dictionary, postings and k-gram index.
  final repository = _TestIndexRepository();

  // An index that reads and writes through the repository's callbacks.
  final index = AsyncCallbackIndex(
      dictionaryLoader: repository.getDictionary,
      dictionaryUpdater: repository.upsertDictionary,
      dictionaryLengthLoader: () => repository.vocabularyLength,
      kGramIndexLoader: repository.getKGramIndex,
      kGramIndexUpdater: repository.upsertKGramIndex,
      postingsLoader: repository.getPostings,
      postingsUpdater: repository.upsertPostings,
      zones: zones,
      k: 3,
      phraseLength: 2,
      tokenizer: TextTokenizer());

  // Tokenize the phrase into search terms with the index's own tokenizer.
  final searchTerms = await index.tokenizer.tokenize(searchPhrase);

  // Index the documents that were passed in (not the global sample data).
  final indexer = TextIndexer(index: index);
  await indexer.indexCollection(documents);

  // Give any pending stream output time to complete before printing.
  await Future.delayed(const Duration(milliseconds: 250));

  // Report what was actually indexed: the repository's dictionary holds one
  // entry per term written through `upsertDictionary`.
  print('Indexed ${repository.dictionary.length} terms '
      'from ${documents.length} documents.');

  // Print the statistics for each of the search terms.
  await _printTermStats(index, searchTerms);
}
/// Prints index statistics for the terms in [tokens] and for every term that
/// shares a 3-gram with them.
///
/// Queries [index] for the k-gram index, the inverse document frequencies,
/// the term frequencies and the dictionary entries of the terms, then prints
/// a table with one row per term followed by a summary that includes the time
/// the four queries took.
Future<void> _printTermStats(
    InvertedIndex index, Iterable<Token> tokens) async {
  // The distinct terms and the 3-grams of the tokens.
  final searchTerms = tokens.terms.toSet();
  final searchKgrams = tokens.kGrams(3);

  // Time the index queries with a monotonic, sub-millisecond clock.
  final stopwatch = Stopwatch()..start();

  // Collect every term the index associates with one of the k-grams.
  final kGramMap = await index.getKGramIndex(searchKgrams.keys);
  final kGramTerms = <String>{
    for (final terms in kGramMap.values) ...terms,
  };

  // Look up the k-gram terms in addition to the original search terms.
  searchTerms.addAll(kGramTerms);

  // Inverse document frequency, term frequency and dictionary entry per term.
  final iDftIndex = await index.getIdFtIndex(searchTerms);
  final tFtIndex = await index.getTfIndex(searchTerms);
  final dictionary = await index.getDictionary(searchTerms);

  // Elapsed query time in milliseconds, to three decimal places.
  stopwatch.stop();
  final dT = (stopwatch.elapsedMicroseconds / 1000).toStringAsFixed(3);

  final rule = '-' * 85;

  // Print the headings.
  print('_' * 85);
  print('DICTIONARY STATISTICS (search terms)');
  print(rule);
  print('${'Term'.padRight(15)}'
      '${'Term Frequency'.padLeft(20)}'
      '${'Document Frequency'.padLeft(20)}'
      '${'Inverse Document Frequency'.padLeft(30)}');
  print(rule);

  // Print one row per term; terms missing from a result count as zero.
  for (final term in searchTerms) {
    final df = dictionary[term] ?? 0;
    final idf = iDftIndex[term] ?? 0.0;
    final tf = tFtIndex[term] ?? 0;
    print('${term.padRight(15)}'
        '${tf.toString().padLeft(20)}'
        '${df.toString().padLeft(20)}'
        '${idf.toStringAsFixed(2).padLeft(30)}');
  }
  print(rule);

  // Print the summary and the query time.
  print('Retrieved');
  print('- dictionary for ${dictionary.length} terms;');
  print('- term frequencies for ${tFtIndex.length} terms;');
  print('- inverse document frequencies for ${iDftIndex.length} terms; and');
  print('- k-gram postings for ${kGramTerms.length} terms.');
  print('in $dT milliseconds.');
  print(rule);
  print(rule);
}
/// Four short plain-text documents, keyed by document id (`doc000` to
/// `doc003`), used as sample data for the in-memory indexing example.
///
/// The text deliberately includes numbers, currencies, abbreviations,
/// hyphenated words, hash-tags and ticker-style identifiers.
final textData = {
'doc000': 'The Dow Jones rallied even as U.S. troops were put on alert amid '
'the Ukraine crisis. Tesla stock fought back while Apple '
'stock struggled. ',
'doc001': '[TSLA.XNGS] Tesla\'s #TeslaMotor Stock Is Getting Hammered.',
'doc002': 'Among the best EV stocks to buy and watch, Tesla '
'(TSLA.XNGS) is pulling back from new highs after a failed breakout '
'above a \$1,201.05 double-bottom entry. ',
'doc003': 'Meanwhile, Peloton reportedly finds an activist investor knocking '
'on its door after a major stock crash fueled by strong indications of '
'mismanagement. In a scathing new letter released Monday, activist '
'Tesla Capital is pushing for Peloton to fire CEO, Chairman and '
'founder John Foley and explore a sale.'
};
/// Eight sample news-item documents, keyed by document id, used as sample
/// data for the asynchronous (persisted) indexing example.
///
/// Each document is a map of field names to values, and its key is repeated
/// in the document's own `id` field. Every document has, among others, the
/// `name`, `description`, `hashTags` and `publicationDate` fields; field
/// values are strings, lists of strings, nested maps or integers.
final jsonData = {
'ee1760a1-a259-50dc-b11d-8baf34d7d1c5': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FTSLA%3AXNGS.png?alt=media&token=c365db47-9482-4237-9267-82f72854d161',
'description':
'A 20-for-1 stock split gave a nice short-term boost to Amazon (AMZN) - Get Amazon.com Inc. Report in late May and in early June, while Alphabet (GOOGL) - Get Alphabet Inc. Report (GOOG) - Get Alphabet Inc. Report has a planned 20-for-1 stock split for next month. Tesla (TSLA) - Get Tesla Inc. Report is also waiting on shareholder approval for a 3-for-1 stock split. ',
'entityType': 'NewsItem',
'hashTags': ['#Tesla'],
'id': 'ee1760a1-a259-50dc-b11d-8baf34d7d1c5',
'itemGuid':
'trading-shopify-stock-ahead-of-10-for-1-stock-split-technical-analysis-june-2022?puc=yahoo&cm_ven=YAHOO&yptr=yahoo',
'linkUrl':
'https://www.thestreet.com/investing/trading-shopify-stock-ahead-of-10-for-1-stock-split-technical-analysis-june-2022?puc=yahoo&cm_ven=YAHOO&yptr=yahoo',
'locale': 'Locale.en_US',
'name': 'Shopify Stock Split What the Charts Say Ahead of 10-for-1 Split',
'publicationDate': '2022-06-28T17:44:00.000Z',
'publisher': {
'linkUrl': 'http://www.thestreet.com/',
'title': 'TheStreet com'
},
'timestamp': 1656464362162
},
'ee1d9610-b902-53e6-8264-840bd403365b': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FO%3AXNYS.png?alt=media&token=15b5e8fe-bec2-4711-b0f7-e5a631287e9e',
'description': 'OR',
'entityType': 'NewsItem',
'hashTags': ['#RealtyIncome'],
'id': 'ee1d9610-b902-53e6-8264-840bd403365b',
'itemGuid': 'auddev&yptr=yahoo',
'linkUrl':
'https://www.ft.com/cms/s/0ea7bcc1-d3a6-4897-8da3-98798c3be487,s01=1.html?ftcamp=traffic/partner/feed_headline/us_yahoo/auddev&yptr=yahoo',
'locale': 'Locale.en_US',
'name': 'History says US stock market has further to fall',
'publicationDate': '2022-06-25T12:35:45.000Z',
'publisher': {'linkUrl': 'http://ft.com/', 'title': 'Financial Times'},
'timestamp': 1656193158270
},
'ef3b0cb6-0297-502b-bd77-283246bc0014': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FJPM-C%3AXNYS.png?alt=media&token=deba8c6d-019e-4d49-9faa-d641a3cf1986',
'description':
'JPMorgan Sees ‘Stratospheric’ \$380 Oil on Worst-Case Russian Cut',
'entityType': 'NewsItem',
'hashTags': ['#JPMorganChase'],
'id': 'ef3b0cb6-0297-502b-bd77-283246bc0014',
'itemGuid': 'germany-risks-cascade-utility-failures-194853753.html',
'linkUrl':
'https://finance.yahoo.com/news/germany-risks-cascade-utility-failures-194853753.html',
'locale': 'Locale.en_US',
'name': 'Germany Risks a Cascade of Utility Failures Economy Chief Says',
'publicationDate': '2022-07-02T19:48:53.000Z',
'publisher': {
'linkUrl': 'https://www.bloomberg.com/',
'title': 'Bloomberg'
},
'timestamp': 1656802194091
},
'f1064cca-bf6d-5689-900b-ecb8769fe30b': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FINTC%3AXNGS.png?alt=media&token=cfefaa0a-7f06-42f8-a316-954ded1fd703',
'description':
'Under CEO Pat Gelsinger, Intel has committed to massively increasing its capital spending investments by tens of billions of dollars. But with a rapidly slowing global economy, repeated product delays, rising competitive threats, and political uncertainty, it might need more help to fund its ambitions. One good place to find it would be Intel (ticker: INTC) generous dividend payout. While the chip maker has paid a consistent dividend for three decades straight, it needs to do whatever it takes to shore up its future—up to and including cutting its dividend. Under CEO Pat Gelsinger, Intel has committed to massively increasing its capital spending investments by tens of billions of dollars.',
'entityType': 'NewsItem',
'hashTags': ['#Intel'],
'id': 'f1064cca-bf6d-5689-900b-ecb8769fe30b',
'itemGuid':
'intel-stock-dividend-future-51656450364?siteid=yhoof2&yptr=yahoo',
'linkUrl':
'https://www.barrons.com/articles/intel-stock-dividend-future-51656450364?siteid=yhoof2&yptr=yahoo',
'locale': 'Locale.en_US',
'name':
'Intel Should Slash Its Dividend The Chip Maker s Future May Depend on It',
'publicationDate': '2022-06-29T12:00:00.000Z',
'publisher': {'linkUrl': 'http://www.barrons.com/', 'title': 'Barrons com'},
'timestamp': 1656540522708
},
'f2fa8eea-6259-5c83-8865-e0b7f80e691d': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FINTC%3AXNGS.png?alt=media&token=cfefaa0a-7f06-42f8-a316-954ded1fd703',
'description':
'Consumers are being hit by the run-up in gasoline, diesel and other oil products, Mike Muller, head of Asia at Vitol Group, said Sunday on a podcast produced by Dubai-based Gulf Intelligence.',
'hashTags': ['#Intel', '#JPMorganChase'],
'id': 'f2fa8eea-6259-5c83-8865-e0b7f80e691d',
'itemGuid': 'surging-fuel-costs-causing-demand-100930872.html',
'linkUrl':
'https://finance.yahoo.com/news/surging-fuel-costs-causing-demand-100930872.html',
'locale': 'Locale.en_US',
'name': 'Surging Fuel Costs Are Causing Demand Destruction Says Vitol',
'publicationDate': '2022-07-03T10:09:30.000Z',
'publisher': {
'linkUrl': 'https://www.bloomberg.com/',
'title': 'Bloomberg'
},
'timestamp': 1656881075901
},
'f6ee5edf-094f-5892-9b37-71b8f2e90d03': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FAAPL%3AXNGS.png?alt=media&token=fb44cde6-4552-42e7-b1b0-7eddc92b1dfc',
'description':
'The Dow Jones Industrial Average rallied out of the red after the latest Fed minutes were released. EV stock Rivian (RIVN) soared on guidance even as Tesla (TSLA) fell. Microsoft (MSFT) and Apple (AAPL) were among the top blue chips.',
'entityType': 'NewsItem',
'hashTags': ['#Apple', '#Tesla'],
'id': 'f6ee5edf-094f-5892-9b37-71b8f2e90d03',
'itemGuid': '?src=A00220&yptr=yahoo',
'linkUrl':
'https://www.investors.com/market-trend/stock-market-today/dow-jones-rallies-as-fed-minutes-reveal-this-jerome-powell-ev-stock-explodes-on-guidance-tesla-stock-apple-stock-pops/?src=A00220&yptr=yahoo',
'locale': 'Locale.en_US',
'name':
'Dow Jones Rallies As Fed Minutes Reveal This EV Stock Explodes On Guidance Apple Stock Vaults',
'publicationDate': '2022-07-06T19:15:45.000Z',
'publisher': {
'linkUrl': 'http://www.investors.com/',
'title': "Investor's Business Daily"
},
'timestamp': 1657158311943
},
'f83c0c39-8cc3-5b0e-91de-61e829ea65dc': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FXOM%3AXNYS.png?alt=media&token=c94499b0-5937-47ea-8f89-94d65d3ed065',
'description':
'Sell Exxon Mobil and other energy stocks before these headwinds hit prices once again: ',
'entityType': 'NewsItem',
'hashTags': ['#ExxonMobil'],
'id': 'f83c0c39-8cc3-5b0e-91de-61e829ea65dc',
'itemGuid':
'sell-exxon-mobil-and-other-energy-stocks-before-these-headwinds-once-again-hit-prices-11656527286?siteid=yhoof2&yptr=yahoo',
'linkUrl':
'https://www.marketwatch.com/story/sell-exxon-mobil-and-other-energy-stocks-before-these-headwinds-once-again-hit-prices-11656527286?siteid=yhoof2&yptr=yahoo',
'locale': 'Locale.en_US',
'name':
'Sell Exxon Mobil and other energy stocks before these headwinds hit prices once again',
'publicationDate': '2022-06-29T18:28:00.000Z',
'publisher': {
'linkUrl': 'http://www.marketwatch.com/',
'title': 'MarketWatch'
},
'timestamp': 1656630576652
},
'fa2b9c9e-096e-5e27-917a-26badeabff83': {
'avatarImageUrl':
'https://firebasestorage.googleapis.com/v0/b/buysellhold-322d1.appspot.com/o/logos%2FTSLA%3AXNGS.png?alt=media&token=c365db47-9482-4237-9267-82f72854d161',
'description':
'Tesla Pauses Plants After Ending Shaky Quarter With a Production Milestone',
'entityType': 'NewsItem',
'hashTags': ['#Tesla'],
'id': 'fa2b9c9e-096e-5e27-917a-26badeabff83',
'itemGuid': 'natural-gas-soars-700-becoming-040106114.html',
'linkUrl':
'https://finance.yahoo.com/news/natural-gas-soars-700-becoming-040106114.html',
'locale': 'Locale.en_US',
'name': 'Natural Gas Soars 700% Becoming Driving Force in the New Cold War',
'publicationDate': '2022-07-05T04:01:06.000Z',
'publisher': {
'linkUrl': 'https://www.bloomberg.com/',
'title': 'Bloomberg'
},
'timestamp': 1657002625523
}
};
/// A dummy asynchronous index repository that keeps the [dictionary],
/// [postings] and [kGramIndex] in memory and simulates latency on most
/// read/write operations.
///
/// Use for testing and examples.
class _TestIndexRepository {
  /// The data-store for the index's term dictionary.
  final Dictionary dictionary = {};

  /// The data-store for the index's postings.
  final Postings postings = {};

  /// The data-store for the index's k-gram index.
  final KGramIndex kGramIndex = {};

  /// Completes after the simulated latency for [entryCount] entries.
  ///
  /// The delay is one millisecond per ten entries, rounded down to whole
  /// milliseconds (roughly 100 uS per entry).
  Future<void> _simulateLatency(int entryCount) =>
      Future.delayed(Duration(milliseconds: entryCount ~/ 10));

  /// Returns the subset of [postings] corresponding to [terms].
  ///
  /// Terms without an entry in [postings] are omitted from the result.
  /// Simulates latency of roughly 100 uS per term in [terms].
  Future<Postings> getPostings(Iterable<String> terms) async {
    final Postings retVal = {};
    for (final term in terms) {
      final entry = postings[term];
      if (entry != null) {
        retVal[term] = entry;
      }
    }
    await _simulateLatency(terms.length);
    return retVal;
  }

  /// Adds the [values] to [dictionary], overwriting existing entries.
  ///
  /// Simulates write latency of roughly 100 uS per entry before writing.
  Future<void> upsertDictionary(Dictionary values) async {
    await _simulateLatency(values.length);
    dictionary.addAll(values);
  }

  /// Adds the [values] to [postings], overwriting existing entries.
  ///
  /// Simulates write latency of roughly 100 uS per entry before writing.
  Future<void> upsertPostings(Postings values) async {
    await _simulateLatency(values.length);
    postings.addAll(values);
  }

  /// Returns the subset of [dictionary] corresponding to [terms], or a copy
  /// of the whole [dictionary] if [terms] is null.
  ///
  /// Terms without an entry in [dictionary] are omitted from the result.
  /// Simulates latency of roughly 100 uS per requested term.
  Future<Dictionary> getDictionary([Iterable<String>? terms]) async {
    // Default to the dictionary's own keys; the k-gram index keys are
    // k-grams rather than terms and would not be found in [dictionary].
    final requested = terms ?? dictionary.keys;
    final Dictionary retVal = {};
    for (final term in requested) {
      final entry = dictionary[term];
      if (entry != null) {
        retVal[term] = entry;
      }
    }
    await _simulateLatency(requested.length);
    return retVal;
  }

  /// Returns the subset of [kGramIndex] corresponding to [kGrams], or a copy
  /// of the whole [kGramIndex] if [kGrams] is null.
  ///
  /// K-grams without an entry in [kGramIndex] are omitted from the result.
  /// Simulates latency of roughly 100 uS per requested k-gram.
  Future<KGramIndex> getKGramIndex([Iterable<KGram>? kGrams]) async {
    final requested = kGrams ?? kGramIndex.keys;
    final KGramIndex retVal = {};
    for (final kGram in requested) {
      final entry = kGramIndex[kGram];
      if (entry != null) {
        retVal[kGram] = entry;
      }
    }
    await _simulateLatency(requested.length);
    return retVal;
  }

  /// Adds the [values] to [kGramIndex], overwriting existing entries.
  ///
  /// Unlike the other write operations, no latency is simulated.
  Future<void> upsertKGramIndex(KGramIndex values) async =>
      kGramIndex.addAll(values);

  /// Returns the length of the [dictionary].
  ///
  /// Simulates a read latency of 5 milliseconds.
  Future<Ft> get vocabularyLength async {
    await Future.delayed(const Duration(milliseconds: 5));
    return dictionary.length;
  }
}