using System;
using System.Collections.Generic;
using System.Text;
using System.Globalization;
namespace ScrewTurn.Wiki.SearchEngine {
/// <summary>
/// Implements a base class for the search index.
/// </summary>
/// <remarks>All instance and static members are thread-safe.</remarks>
public abstract class InMemoryIndexBase : IInMemoryIndex {
/// <summary>
/// The stop words to be used while indexing new content.
/// </summary>
protected string[] stopWords = null;

/// <summary>
/// Contains the index catalog: every indexed word, keyed by its text.
/// </summary>
protected Dictionary<string, Word> catalog = null;

/// <summary>
/// The <see cref="BuildDocument"/> delegate used to turn dumped documents back into
/// <see cref="IDocument"/> instances (set via <see cref="SetBuildDocumentDelegate"/>).
/// </summary>
protected BuildDocument buildDocument = null;

/// <summary>
/// An event fired when the index is changed. Handlers receive an
/// <see cref="IndexChangedEventArgs"/> and may set its <c>Result</c>.
/// </summary>
public event EventHandler<IndexChangedEventArgs> IndexChanged;
/// <summary>
/// Sets the delegate used for converting a <see cref="DumpedDocument"/> to an instance of a class
/// implementing <see cref="IDocument"/>, while reading index data from a permanent storage.
/// </summary>
/// <param name="buildDocument">The delegate (cannot be <c>null</c>).</param>
/// <remarks>This method must be called before invoking <see cref="InitializeData"/>,
/// which refuses to run while no delegate is set.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="buildDocument"/> is <c>null</c>.</exception>
public void SetBuildDocumentDelegate(BuildDocument buildDocument) {
	if(buildDocument == null) {
		throw new ArgumentNullException("buildDocument");
	}

	// The field is swapped under the index lock, like every other state change
	lock(this) {
		this.buildDocument = buildDocument;
	}
}
/// <summary>
/// Takes care of firing the <see cref="IndexChanged"/> event.
/// </summary>
/// <param name="document">The affected document.</param>
/// <param name="change">The change performed.</param>
/// <param name="changeData">The dumped change data.</param>
/// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
/// <returns>The storage result taken from the event arguments after the handlers ran,
/// or <c>null</c> when no handler is subscribed.</returns>
protected IndexStorerResult OnIndexChange(IDocument document, IndexChangeType change, DumpedChange changeData, object state) {
	// Take a snapshot of the delegate: testing the event for null and then invoking it
	// directly would throw if the last subscriber detached between the two steps
	var handler = IndexChanged;
	if(handler == null) return null;

	IndexChangedEventArgs args = new IndexChangedEventArgs(document, change, changeData, state);
	handler(this, args);
	return args.Result;
}
/// <summary>
/// Initializes a new instance of the <see cref="InMemoryIndexBase"/> class
/// with an empty stop word list and an empty catalog.
/// </summary>
public InMemoryIndexBase() {
	this.stopWords = new string[0];
	// Initial capacity of 5000 words; the dictionary still grows on demand
	this.catalog = new Dictionary<string, Word>(5000);
}
/// <summary>
/// Gets or sets the stop words to be used while indexing new content.
/// </summary>
/// <remarks>The getter hands out the stored array itself, not a copy.</remarks>
/// <exception cref="ArgumentNullException">If the assigned value is <c>null</c>.</exception>
public string[] StopWords {
	get {
		string[] current;
		lock(this) {
			current = stopWords;
		}
		return current;
	}
	set {
		if(value == null) {
			throw new ArgumentNullException("value", "Stop words cannot be null");
		}
		lock(this) {
			stopWords = value;
		}
	}
}
/// <summary>
/// Gets the total count of unique words.
/// </summary>
/// <remarks>The value is the number of entries in the catalog, read under the index lock.</remarks>
public int TotalWords {
	get {
		int uniqueWords;
		lock(this) {
			uniqueWords = catalog.Count;
		}
		return uniqueWords;
	}
}
/// <summary>
/// Gets the total count of documents.
/// </summary>
/// <remarks>Computing the result visits every occurrence entry of every word in the catalog
/// while holding the index lock, and counts each distinct document key once.
/// Each membership test is a linear scan of the documents collected so far.</remarks>
// NOTE(review): the generic type arguments of List and KeyValuePair appear to be missing
// in this block (and in the catalog declaration) - confirm against the original file.
public int TotalDocuments {
get {
// Distinct documents found so far
List docs = new List(100);
lock(this) {
// Outer loop: one entry per catalog word
foreach(KeyValuePair pair in catalog) {
// Inner loop: one entry per document the word occurs in (the key is the document)
foreach(KeyValuePair pair2 in pair.Value.Occurrences) {
if(!docs.Contains(pair2.Key)) docs.Add(pair2.Key);
}
}
}
return docs.Count;
}
}
/// <summary>
/// Gets the total number of occurrences (count of words in each document).
/// </summary>
/// <remarks>Sums <c>TotalOccurrences</c> of every word in the catalog, so the cost grows
/// linearly with the number of words in the index. The sum is taken under the index lock.</remarks>
public int TotalOccurrences {
	get {
		int count = 0;
		lock(this) {
			// Only the words are needed here, so walk the values directly
			foreach(Word word in catalog.Values) {
				count += word.TotalOccurrences;
			}
		}
		return count;
	}
}
/// <summary>
/// Completely clears the index (stop words are not affected).
/// </summary>
/// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
/// <remarks>The catalog is emptied and the change notification is raised while the index lock is held.</remarks>
public void Clear(object state) {
	lock(this) {
		// Empty the catalog before notifying, so handlers run against the cleared index
		catalog.Clear();

		// No document and no change data are associated with a full clear
		OnIndexChange(null, IndexChangeType.IndexCleared, null, state);
	}
}
/// <summary>
/// Initializes index data by completely emptying the index catalog and storing the specified data.
/// </summary>
/// <param name="documents">The documents.</param>
/// <param name="words">The words.</param>
/// <param name="mappings">The mappings.</param>
/// <remarks>The method does not check the consistency of the data passed as arguments.
/// Documents for which the build delegate returns <c>null</c> are skipped together with their mappings;
/// a <see cref="KeyNotFoundException"/> raised while applying a mapping is swallowed.
/// No change notification is raised by this method.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/>
/// or <paramref name="mappings"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If <see cref="SetBuildDocumentDelegate"/> was not called.</exception>
// NOTE(review): the generic type arguments of Dictionary and List appear to be missing
// throughout this block - confirm against the original file.
public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings) {
if(documents == null) throw new ArgumentNullException("documents");
if(words == null) throw new ArgumentNullException("words");
if(mappings == null) throw new ArgumentNullException("mappings");
// NOTE(review): this check reads the delegate field before the lock is taken
if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");
lock(this) {
// The old dictionary is emptied and then replaced by a new instance sized for the incoming words
catalog.Clear();
catalog = new Dictionary(words.Length);
// Contains the IDs of documents that are missing
List missingDocuments = new List(50);
// 1. Prepare a dictionary with all documents for use in the last step
Dictionary tempDocuments = new Dictionary(documents.Length);
foreach(DumpedDocument doc in documents) {
IDocument builtDoc = buildDocument(doc);
// Null means that the document no longer exists - silently skip it
if(builtDoc != null) {
tempDocuments.Add(doc.ID, builtDoc);
}
else {
missingDocuments.Add(doc.ID);
}
}
// 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
// (Add throws on a duplicate word ID or duplicate word text)
Dictionary tempWords = new Dictionary(words.Length);
// Test for hashing algorithm -- no more used since sequential IDs
//if(words.Length > 0 && words[0].ID != Tools.HashString(words[0].Text)) {
// throw new InvalidOperationException("The search engine index seems to use an outdated hashing algorithm");
//}
foreach(DumpedWord w in words) {
Word word = new Word(w.ID, w.Text);
/*if(tempWords.ContainsKey(w.ID)) {
string t = string.Format("CURRENT: {0}, {1} --- EXISTING: {2}", w.ID, word, tempWords[w.ID]);
Console.WriteLine(t);
}*/
tempWords.Add(w.ID, word);
/*if(catalog.ContainsKey(w.Text)) {
string t = string.Format("CURRENT: {0}, {1} --- EXISTING: {2}", w.ID, word, catalog[w.Text]);
Console.WriteLine(t);
}*/
catalog.Add(w.Text, word);
}
// 3. Add mappings and documents
foreach(DumpedWordMapping map in mappings) {
// HACK: Skip mappings that refer to missing documents and gracefully skip unknown words
// (the membership test below is a linear scan of the missing-document list)
if(!missingDocuments.Contains(map.DocumentID)) {
try {
tempWords[map.WordID].AddOccurrence(tempDocuments[map.DocumentID],
map.FirstCharIndex, map.WordIndex, WordLocation.GetInstance(map.Location));
}
// Unknown word or document ID: the mapping is dropped without any report.
// NOTE(review): this also hides a KeyNotFoundException thrown inside AddOccurrence or GetInstance
catch(KeyNotFoundException) { }
}
}
}
}
/// <summary>
/// Stores a document in the index.
/// </summary>
/// <param name="document">The document.</param>
/// <param name="keywords">The document keywords, if any, an empty array or <c>null</c> otherwise.</param>
/// <param name="content">The content of the document.</param>
/// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
/// <returns>The number of indexed words (including duplicates) in the document content, title and keywords,
/// or zero when the change notification did not yield a document ID.</returns>
/// <remarks>If the specified document was already in the index, all the old occurrences
/// are deleted from the index before the new ones are stored.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="document"/> or <paramref name="content"/> are <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">If the storage result carries no ID for a newly-created word.</exception>
public int StoreDocument(IDocument document, string[] keywords, string content, object state) {
	if(document == null) throw new ArgumentNullException("document");
	if(keywords == null) keywords = new string[0];
	if(content == null) throw new ArgumentNullException("content");

	// Drop any previous version of the document, notifying listeners of the removal
	lock(this) {
		DumpedChange removeChange = RemoveDocumentInternal(document);
		if(removeChange != null) {
			OnIndexChange(document, IndexChangeType.DocumentRemoved, removeChange, state);
		}
	}

	keywords = Tools.CleanupKeywords(keywords);

	// When the IndexStorer handles the IndexChanged event and a document is added, the storer generates
	// the final IDs and returns them via the event arguments; until then new words carry temporary IDs
	// counting down from uint.MaxValue, which are replaced at the end of this method
	List<DumpedWord> dw = new List<DumpedWord>(content.Length / 5);
	List<DumpedWordMapping> dm = new List<DumpedWordMapping>(content.Length / 5);
	List<Word> newWords = new List<Word>(50);
	Word tempWord;
	DumpedWord tempDumpedWord;
	DumpedWordMapping mapping;
	int count = 0;
	uint sequentialWordId = uint.MaxValue;

	// Store content words
	WordInfo[] words = document.Tokenize(content);
	words = Tools.RemoveStopWords(words, stopWords);
	foreach(WordInfo info in words) {
		mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Content, out tempWord, out tempDumpedWord);
		dm.Add(mapping);
		TrackNewWord(mapping, tempWord, tempDumpedWord, dw, newWords, ref sequentialWordId);
	}
	count += words.Length;

	// Store title words
	words = document.Tokenize(document.Title);
	words = Tools.RemoveStopWords(words, stopWords);
	foreach(WordInfo info in words) {
		mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Title, out tempWord, out tempDumpedWord);
		dm.Add(mapping);
		TrackNewWord(mapping, tempWord, tempDumpedWord, dw, newWords, ref sequentialWordId);
	}
	count += words.Length;

	// Store keywords; tempCount is the running character offset of each keyword,
	// advanced by the keyword length plus one
	ushort tempCount = 0;
	for(ushort i = 0; i < (ushort)keywords.Length; i++) {
		mapping = StoreWord(keywords[i], document, tempCount, i, WordLocation.Keywords, out tempWord, out tempDumpedWord);
		dm.Add(mapping);
		TrackNewWord(mapping, tempWord, tempDumpedWord, dw, newWords, ref sequentialWordId);
		tempCount += (ushort)(1 + keywords[i].Length);
	}
	count += keywords.Length;

	IndexStorerResult result = OnIndexChange(document, IndexChangeType.DocumentAdded,
		new DumpedChange(new DumpedDocument(document), dw, dm), state);

	// HACK: no result or no document ID -> index is corrupted, silently return
	if(result == null || !result.DocumentID.HasValue) {
		return 0;
	}

	// Update document ID
	document.ID = result.DocumentID.Value;

	// Replace the temporary ID of each new word with the one assigned by the storer (matched by text)
	foreach(Word word in newWords) {
		bool wordIdUpdated = false;
		foreach(WordId id in result.WordIDs) {
			if(id.Text == word.Text) {
				word.ID = id.ID;
				wordIdUpdated = true;
				break;
			}
		}
		if(!wordIdUpdated) throw new InvalidOperationException("No ID for new word");
	}

	return count;
}

/// <summary>
/// Records a word that StoreWord has just created: assigns it the current temporary ID
/// (also on its mapping and dumped data), queues it for dumping and moves to the next temporary ID.
/// Does nothing when the word already existed (either reference is <c>null</c>).
/// </summary>
private static void TrackNewWord(DumpedWordMapping mapping, Word newWord, DumpedWord dumpedWord,
	List<DumpedWord> dumpedWords, List<Word> newWords, ref uint temporaryId) {

	if(dumpedWord == null || newWord == null) return;

	mapping.WordID = temporaryId;
	dumpedWord.ID = temporaryId;
	dumpedWords.Add(dumpedWord);
	newWord.ID = temporaryId;
	newWords.Add(newWord);
	temporaryId--;
}
/// <summary>
/// Stores a word in the catalog.
/// </summary>
/// <param name="wordText">The word to store (lowercased with the invariant culture before lookup).</param>
/// <param name="document">The document the word occurs in.</param>
/// <param name="firstCharIndex">The index of the first character of the word in the document the word occurs at.</param>
/// <param name="wordIndex">The index of the word in the document.</param>
/// <param name="location">The location of the word.</param>
/// <param name="newWord">The new word if one was created, <c>null</c> if the word was already in the catalog.</param>
/// <param name="dumpedWord">The dumped word data if a word was created, <c>null</c> otherwise.</param>
/// <returns>The dumped word mapping data.</returns>
protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
	WordLocation location, out Word newWord, out DumpedWord dumpedWord) {

	wordText = wordText.ToLower(CultureInfo.InvariantCulture);

	lock(this) {
		Word entry;
		bool alreadyKnown = catalog.TryGetValue(wordText, out entry);

		if(alreadyKnown) {
			newWord = null;
			dumpedWord = null;
		}
		else {
			// Use ZERO as initial ID, update when IndexStorer has stored the word;
			// the new instance is handed back to the caller for that purpose
			entry = new Word(0, wordText);
			catalog.Add(wordText, entry);
			newWord = entry;
			dumpedWord = new DumpedWord(entry);
		}

		entry.AddOccurrence(document, firstCharIndex, wordIndex, location);

		DumpedWordMapping mapping = new DumpedWordMapping(entry.ID, document.ID, firstCharIndex, wordIndex, location.Location);
		return mapping;
	}
}
/// <summary>
/// Removes a document from the index.
/// </summary>
/// <param name="document">The document to remove.</param>
/// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
/// <remarks>Nothing happens, and no notification is raised, when the document is not in the index.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="document"/> is <c>null</c>.</exception>
public void RemoveDocument(IDocument document, object state) {
	if(document == null) throw new ArgumentNullException("document");

	// RemoveDocumentInternal walks and mutates the catalog without locking on its own,
	// so hold the index lock here, exactly as StoreDocument does for the same call
	lock(this) {
		DumpedChange dc = RemoveDocumentInternal(document);
		if(dc != null) {
			OnIndexChange(document, IndexChangeType.DocumentRemoved, dc, state);
		}
		// else nothing to do
	}
}
/// <summary>
/// Finds a document with a specified name.
/// </summary>
/// <param name="name">The name of the document.</param>
/// <returns>The first document found whose name equals <paramref name="name"/>
/// (ordinal, case-insensitive), or <c>null</c> if no word occurrence refers to such a document.</returns>
/// <remarks>The search walks the occurrence entries of every word in the catalog.
/// This method takes no lock itself.</remarks>
// NOTE(review): the generic type arguments of KeyValuePair appear to be missing - confirm against the original file.
private IDocument FindDocument(string name) {
// Outer loop: one entry per catalog word
foreach(KeyValuePair pair in catalog) {
// Inner loop: one entry per document the word occurs in (the key is the document)
foreach(KeyValuePair pair2 in pair.Value.Occurrences) {
if(StringComparer.OrdinalIgnoreCase.Compare(pair2.Key.Name, name) == 0) {
return pair2.Key;
}
}
}
return null;
}
/// <summary>
/// Removes a document from the index and generates the dumped change data.
/// </summary>
/// <param name="document">The document to remove.</param>
/// <returns>The dumped change data, or <c>null</c> when no document with the same name is in the index.</returns>
/// <remarks>This method takes no lock itself; StoreDocument invokes it while holding the index lock.
/// Every word left without occurrences is dropped from the catalog and reported in the change data.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="document"/> is <c>null</c>.</exception>
protected DumpedChange RemoveDocumentInternal(IDocument document) {
	if(document == null) throw new ArgumentNullException("document");

	// Find real document to remove by name: occurrences are keyed by the indexed instance
	document = FindDocument(document.Name);
	if(document == null) {
		return null;
	}

	// Detach the document from every word, collecting the removed mappings
	List<DumpedWordMapping> dm = new List<DumpedWordMapping>(1500);
	foreach(Word word in catalog.Values) {
		dm.AddRange(word.RemoveOccurrences(document));
	}

	// Collect the words that have no occurrences left; the catalog cannot be
	// modified while it is being enumerated, so removal happens in a second pass
	List<string> toRemove = new List<string>(50);
	foreach(KeyValuePair<string, Word> entry in catalog) {
		if(entry.Value.TotalOccurrences == 0) toRemove.Add(entry.Key);
	}

	List<DumpedWord> dw = new List<DumpedWord>(toRemove.Count);
	foreach(string w in toRemove) {
		dw.Add(new DumpedWord(catalog[w]));
		catalog.Remove(w);
	}

	// The document is known to be non-null here, so change data is always produced
	return new DumpedChange(new DumpedDocument(document), dw, dm);
}
/// <summary>
/// Performs a search in the index.
/// </summary>
/// <param name="parameters">The search parameters.</param>
/// <returns>The results.</returns>
/// <remarks>Filtering by document type is requested only when the parameters carry document type tags.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="parameters"/> is <c>null</c>.</exception>
// NOTE(review): the catalog reference is handed to the fetcher without taking lock(this), and the
// fetcher synchronizes on the dictionary instance instead - confirm this is the intended locking scheme.
public SearchResultCollection Search(SearchParameters parameters) {
	if(parameters == null) {
		throw new ArgumentNullException("parameters");
	}

	using(IWordFetcher fetcher = new InMemoryIndexWordFetcher(catalog)) {
		if(parameters.DocumentTypeTags != null) {
			// Tags supplied: restrict the search to them
			return Tools.SearchInternal(parameters.Query, parameters.DocumentTypeTags, true, parameters.Options, fetcher);
		}

		// No tags: unfiltered search
		return Tools.SearchInternal(parameters.Query, null, false, parameters.Options, fetcher);
	}
}
}
/// <summary>
/// Implements a word fetcher for use with the in-memory index.
/// </summary>
public class InMemoryIndexWordFetcher : IWordFetcher {

	// The catalog to look words up in; assigned once in the constructor
	private readonly Dictionary<string, Word> catalog;

	/// <summary>
	/// Initializes a new instance of the <see cref="InMemoryIndexWordFetcher"/> class.
	/// </summary>
	/// <param name="catalog">The index catalog.</param>
	/// <exception cref="ArgumentNullException">If <paramref name="catalog"/> is <c>null</c>.</exception>
	public InMemoryIndexWordFetcher(Dictionary<string, Word> catalog) {
		if(catalog == null) throw new ArgumentNullException("catalog");

		this.catalog = catalog;
	}

	/// <summary>
	/// Tries to get a word.
	/// </summary>
	/// <param name="text">The text of the word.</param>
	/// <param name="word">The found word, if any, <c>null</c> otherwise.</param>
	/// <returns><c>true</c> if the word is found, <c>false</c> otherwise.</returns>
	public bool TryGetWord(string text, out Word word) {
		// The lookup is synchronized on the catalog instance itself
		lock(catalog) {
			return catalog.TryGetValue(text, out word);
		}
	}

	#region IDisposable Members

	/// <summary>
	/// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
	/// </summary>
	public void Dispose() {
		// Nothing to do
	}

	#endregion

}
}