text_indexing 0.0.1+6 copy "text_indexing: ^0.0.1+6" to clipboard
text_indexing: ^0.0.1+6 copied to clipboard

outdated

Dart library for creating an inverted index on a collection of text documents.

example/text_indexing_example.dart

// Copyright ©2022, GM Consult (Pty) Ltd
// BSD 3-Clause License
// All rights reserved

// ignore: unused_import
import 'package:text_indexing/text_indexing.dart';

/// Runs the two indexer examples in this file, one after the other:
/// - [_inMemoryIndexerExample] is a simple example of a [InMemoryIndexer]
///   indexing the [textData] dataset; and
/// - [_persistedIndexerExample] is a simple example of a [PersistedIndexer]
///   indexing the [textData] dataset.
///
/// Returns a [Future] that completes once both examples have finished, so a
/// caller can await the run and observe any error it throws.
Future<void> main() async {
  // Run a simple example of the [InMemoryIndexer] on the [textData] dataset.
  await _inMemoryIndexerExample(textData);

  // Run a simple example of the [PersistedIndexer] on the [textData] dataset.
  await _persistedIndexerExample(textData);
}

/// A simple example of the [InMemoryIndexer] on the [documents] dataset:
/// - initialize the [Dictionary];
/// - initialize the [Postings];
/// - initialize a [InMemoryIndexer];
/// - listen to the [InMemoryIndexer.postingsStream], printing the
///   emitted postings for each indexed document;
/// - iterate through the sample data;
/// - index each document, adding/updating terms in the [Dictionary]
///   and postings in the [Postings];
/// - cancel the stream subscription; and
/// - print (at most) the first 5 terms of the [Dictionary] when sorted by
///   [TermSortStrategy.byFrequency].
///
/// The keys of [documents] are used as document ids and the values as the
/// text to index.
Future<void> _inMemoryIndexerExample(Map<String, String> documents) async {
  // - initialize the [Dictionary]
  final dictionary = <String, int>{};

  // - initialize the [Postings]
  final postings = <String, Map<String, List<int>>>{};

  // - initialize a [InMemoryIndexer]
  final indexer = InMemoryIndexer(dictionary: dictionary, postings: postings);

  // - listen to the postings stream; the subscription is kept so that it can
  //   be cancelled once the example is done
  final subscription = indexer.postingsStream.listen((event) {
    if (event.isNotEmpty) {
      final docId = event.first.docId;
      final terms = event.map((e) => e.term).toList();
      print('$docId: $terms');
    }
  });

  try {
    // - iterate through the sample data, indexing one document at a time
    for (final doc in documents.entries) {
      await indexer.index(doc.key, doc.value);
    }

    // wait for stream elements to complete printing
    await Future.delayed(const Duration(milliseconds: 250));
  } finally {
    // release the listener even if indexing failed
    await subscription.cancel();
  }

  // print the 5 most popular terms with their frequencies
  final terms = dictionary.toList(TermSortStrategy.byFrequency);
  for (final term in terms.take(5)) {
    print('${term.term}: ${term.frequency}');
  }
}

/// A simple example of the [PersistedIndexer] on the [documents] dataset,
/// using a [_TestIndex] as a simulated persisted index repository with
/// artificial latency:
/// - initialize the [_TestIndex];
/// - initialize a [PersistedIndexer] wired to the [_TestIndex] callbacks;
/// - listen to the [PersistedIndexer.postingsStream], printing the
///   emitted postings for each indexed document;
/// - iterate through the sample data;
/// - index each document, adding/updating terms in the [_TestIndex.dictionary]
///   and postings in the [_TestIndex.postings];
/// - cancel the stream subscription; and
/// - print (at most) the first 5 terms of the [_TestIndex.dictionary] when
///   sorted by [TermSortStrategy.byFrequency].
///
/// The keys of [documents] are used as document ids and the values as the
/// text to index.
Future<void> _persistedIndexerExample(Map<String, String> documents) async {
  // - initialize a [_TestIndex]
  final index = _TestIndex();

  // - initialize a [PersistedIndexer]
  final indexer = PersistedIndexer(
      termsLoader: index.loadTerms,
      dictionaryUpdater: index.updateDictionary,
      postingsLoader: index.loadTermPostings,
      postingsUpdater: index.upsertTermPostings);

  // - listen to the postings stream; the subscription is kept so that it can
  //   be cancelled once the example is done
  final subscription = indexer.postingsStream.listen((event) {
    if (event.isNotEmpty) {
      final docId = event.first.docId;
      final terms = event.map((e) => e.term).toList();
      print('$docId: $terms');
    }
  });

  try {
    // - iterate through the sample data, indexing one document at a time
    for (final doc in documents.entries) {
      await indexer.index(doc.key, doc.value);
    }

    // wait for stream elements to complete printing
    await Future.delayed(const Duration(milliseconds: 250));
  } finally {
    // release the listener even if indexing failed
    await subscription.cancel();
  }

  // print the 5 most popular terms with their frequencies
  final terms = index.dictionary.toList(TermSortStrategy.byFrequency);
  for (final term in terms.take(5)) {
    print('${term.term}: ${term.frequency}');
  }
}

/// Sample dataset of four short text documents, keyed by document id
/// (`doc000` to `doc003`).
///
/// The text includes numbers, currencies, abbreviations, hyphens and
/// identifiers.
final Map<String, String> textData = <String, String>{
  'doc000': 'The Dow Jones rallied even as U.S. troops were put on alert '
      'amid the Ukraine crisis. Tesla stock fought back while Apple stock '
      'struggled. ',
  'doc001': "[TSLA.XNGS] Tesla's #TeslaMotor Stock Is Getting Hammered.",
  'doc002': 'Among the best EV stocks to buy and watch, Tesla (TSLA.XNGS) '
      'is pulling back from new highs after a failed breakout above a '
      '\$1,201.05 double-bottom entry. ',
  'doc003': 'Meanwhile, Peloton reportedly finds an activist investor '
      'knocking on its door after a major stock crash fueled by strong '
      'indications of mismanagement. In a scathing new letter released '
      'Monday, activist Tesla Capital is pushing for Peloton to fire CEO, '
      'Chairman and founder John Foley and explore a sale.',
};

/// A dummy asynchronous index repository with 50 millisecond latency on
/// read/write operations to the [dictionary] and [postings].
///
/// Its four methods are passed to [PersistedIndexer] as the `termsLoader`,
/// `dictionaryUpdater`, `postingsLoader` and `postingsUpdater` callbacks.
///
/// Use for testing and examples.
class _TestIndex {
  /// The artificial delay applied to every read/write operation.
  static const _latency = Duration(milliseconds: 50);

  /// The [Dictionary] instance that is the data-store for the index's term
  /// dictionary.
  final Dictionary dictionary = {};

  /// The [Postings] instance that is the data-store for the index's
  /// postings.
  final Postings postings = {};

  /// Completes after [_latency], simulating a slow persisted repository.
  Future<void> _simulateLatency() => Future<void>.delayed(_latency);

  /// Implementation of [PostingsLoader].
  ///
  /// Returns a subset of [postings] corresponding to [terms]. Terms that have
  /// no entry in [postings] are omitted from the result.
  ///
  /// Simulates latency of 50 milliseconds.
  Future<Postings> loadTermPostings(Iterable<String> terms) async {
    // Simulate read latency of 50 milliseconds.
    await _simulateLatency();
    final Postings retVal = {};
    for (final term in terms) {
      final entry = postings[term];
      if (entry != null) {
        retVal[term] = entry;
      }
    }
    return retVal;
  }

  /// Implementation of [DictionaryUpdater].
  ///
  /// Adds/overwrites the [values] to [dictionary].
  ///
  /// Simulates latency of 50 milliseconds.
  Future<void> updateDictionary(Dictionary values) async {
    // Simulate write latency of 50 milliseconds.
    await _simulateLatency();
    dictionary.addAll(values);
  }

  /// Implementation of [PostingsUpdater].
  ///
  /// Adds/overwrites the [values] to [postings].
  ///
  /// Simulates latency of 50 milliseconds.
  Future<void> upsertTermPostings(Postings values) async {
    // Simulate write latency of 50 milliseconds.
    await _simulateLatency();
    postings.addAll(values);
  }

  /// Implementation of [DictionaryLoader].
  ///
  /// Returns a subset of [dictionary] corresponding to [terms]. Terms that
  /// have no entry in [dictionary] are omitted from the result.
  ///
  /// Simulates latency of 50 milliseconds.
  Future<Dictionary> loadTerms(Iterable<String> terms) async {
    // Simulate read latency of 50 milliseconds.
    await _simulateLatency();
    final Dictionary retVal = {};
    for (final term in terms) {
      final entry = dictionary[term];
      if (entry != null) {
        retVal[term] = entry;
      }
    }
    return retVal;
  }
}
5
likes
0
pub points
49%
popularity

Publisher

verified publisher: gmconsult.com.au

Dart library for creating an inverted index on a collection of text documents.

Homepage
Repository (GitHub)
View/report issues

License

unknown (license)

Dependencies

meta, porter_2_stemmer, rxdart, text_analysis

More

Packages that depend on text_indexing