using System;
using System.Collections.Generic;
using System.Text;
using System.Globalization;

namespace ScrewTurn.Wiki.SearchEngine {

    /// <summary>
    /// Implements a base class for the search index.
    /// </summary>
    /// <remarks>All instance members are intended to be thread-safe: members that read or
    /// modify the catalog synchronize on the index instance itself (<c>lock(this)</c>).</remarks>
    public abstract class InMemoryIndexBase : IInMemoryIndex {

        /// <summary>
        /// The stop words to be used while indexing new content.
        /// </summary>
        protected string[] stopWords = null;

        /// <summary>
        /// Contains the index catalog, keyed by word text.
        /// </summary>
        protected Dictionary<string, Word> catalog = null;

        /// <summary>
        /// The <see cref="BuildDocument" /> delegate.
        /// </summary>
        protected BuildDocument buildDocument = null;

        /// <summary>
        /// An event fired when the index is changed.
        /// </summary>
        public event EventHandler<IndexChangedEventArgs> IndexChanged;

        /// <summary>
        /// Sets the delegate used for converting a <see cref="DumpedDocument" /> to an instance of a class
        /// implementing <see cref="IDocument" />, while reading index data from a permanent storage.
        /// </summary>
        /// <param name="buildDocument">The delegate (cannot be null).</param>
        /// <remarks>This method must be called before invoking <see cref="InitializeData" />.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="buildDocument"/> is <c>null</c>.</exception>
        public void SetBuildDocumentDelegate(BuildDocument buildDocument) {
            if(buildDocument == null) throw new ArgumentNullException("buildDocument");

            lock(this) {
                this.buildDocument = buildDocument;
            }
        }

        /// <summary>
        /// Takes care of firing the <see cref="IndexChanged" /> event.
        /// </summary>
        /// <param name="document">The affected document.</param>
        /// <param name="change">The change performed.</param>
        /// <param name="changeData">The dumped change data.</param>
        /// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
        /// <returns>The storage result set by the event handlers, or <c>null</c> when nobody is subscribed.</returns>
        protected IndexStorerResult OnIndexChange(IDocument document, IndexChangeType change, DumpedChange changeData, object state) {
            // Copy the delegate to a local: a concurrent unsubscribe could otherwise
            // null the event between the check and the invocation
            EventHandler<IndexChangedEventArgs> handler = IndexChanged;
            if(handler == null) return null;

            IndexChangedEventArgs args = new IndexChangedEventArgs(document, change, changeData, state);
            handler(this, args);
            return args.Result;
        }

        /// <summary>
        /// Initializes a new instance of the <see cref="InMemoryIndexBase" /> class.
        /// </summary>
        public InMemoryIndexBase() {
            this.stopWords = new string[0];
            this.catalog = new Dictionary<string, Word>(5000);
        }

        /// <summary>
        /// Gets or sets the stop words to be used while indexing new content.
        /// </summary>
        /// <exception cref="ArgumentNullException">If the assigned value is <c>null</c>.</exception>
        public string[] StopWords {
            get {
                lock(this) {
                    return stopWords;
                }
            }
            set {
                if(value == null) throw new ArgumentNullException("value", "Stop words cannot be null");
                lock(this) {
                    stopWords = value;
                }
            }
        }

        /// <summary>
        /// Gets the total count of unique words.
        /// </summary>
        /// <remarks>Computing the result is <b>O(1)</b>.</remarks>
        public int TotalWords {
            get {
                lock(this) {
                    return catalog.Count;
                }
            }
        }

        /// <summary>
        /// Gets the total count of documents.
        /// </summary>
        /// <remarks>Computing the result is <b>O(n*m)</b>, where <b>n</b> is the number of
        /// words in the index and <b>m</b> is the number of documents.</remarks>
        public int TotalDocuments {
            get {
                List<IDocument> docs = new List<IDocument>(100);
                lock(this) {
                    foreach(KeyValuePair<string, Word> pair in catalog) {
                        // Only the key (the document) of each occurrence entry is needed here
                        foreach(var occurrence in pair.Value.Occurrences) {
                            if(!docs.Contains(occurrence.Key)) docs.Add(occurrence.Key);
                        }
                    }
                }
                return docs.Count;
            }
        }

        /// <summary>
        /// Gets the total number of occurrences (count of words in each document).
        /// </summary>
        /// <remarks>Computing the result is <b>O(n)</b>,
        /// where <b>n</b> is the number of words in the index.</remarks>
        public int TotalOccurrences {
            get {
                int count = 0;
                lock(this) {
                    foreach(KeyValuePair<string, Word> pair in catalog) {
                        count += pair.Value.TotalOccurrences;
                    }
                }
                return count;
            }
        }

        /// <summary>
        /// Completely clears the index (stop words are not affected).
        /// </summary>
        /// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
        public void Clear(object state) {
            lock(this) {
                catalog.Clear();
                OnIndexChange(null, IndexChangeType.IndexCleared, null, state);
            }
        }

        /// <summary>
        /// Initializes index data by completely emptying the index catalog and storing the specified data.
        /// </summary>
        /// <param name="documents">The documents.</param>
        /// <param name="words">The words.</param>
        /// <param name="mappings">The mappings.</param>
        /// <remarks>The method <b>does not</b> check the consistency of the data passed as arguments.
        /// Mappings that refer to a missing document, an unknown document or an unknown word are skipped.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="documents"/>, <paramref name="words"/>
        /// or <paramref name="mappings"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If <see cref="SetBuildDocumentDelegate"/> was not called.</exception>
        public void InitializeData(DumpedDocument[] documents, DumpedWord[] words, DumpedWordMapping[] mappings) {
            if(documents == null) throw new ArgumentNullException("documents");
            if(words == null) throw new ArgumentNullException("words");
            if(mappings == null) throw new ArgumentNullException("mappings");
            if(buildDocument == null) throw new InvalidOperationException("InitializeData can be invoked only when the BuildDocument delegate is set");

            lock(this) {
                catalog.Clear();
                catalog = new Dictionary<string, Word>(words.Length);

                // NOTE(review): word IDs are uint (see the uint temporary IDs in StoreDocument);
                // document IDs are assumed to be uint as well - confirm against DumpedDocument.ID

                // Contains the IDs of documents that are missing
                List<uint> missingDocuments = new List<uint>(50);

                // 1. Prepare a dictionary with all documents for use in the last step
                Dictionary<uint, IDocument> tempDocuments = new Dictionary<uint, IDocument>(documents.Length);
                foreach(DumpedDocument doc in documents) {
                    IDocument builtDoc = buildDocument(doc);
                    // Null means that the document no longer exists - silently skip it
                    if(builtDoc != null) {
                        tempDocuments.Add(doc.ID, builtDoc);
                    }
                    else {
                        missingDocuments.Add(doc.ID);
                    }
                }

                // 2. Load words into the catalog, keeping track of them by ID in a dictionary for the next step
                Dictionary<uint, Word> tempWords = new Dictionary<uint, Word>(words.Length);
                foreach(DumpedWord w in words) {
                    Word word = new Word(w.ID, w.Text);
                    tempWords.Add(w.ID, word);
                    catalog.Add(w.Text, word);
                }

                // 3. Add mappings and documents
                foreach(DumpedWordMapping map in mappings) {
                    // HACK: Skip mappings that refer to missing documents
                    if(missingDocuments.Contains(map.DocumentID)) continue;

                    // Gracefully skip unknown words and unknown documents: explicit lookups
                    // instead of catching KeyNotFoundException as control flow
                    Word mappedWord;
                    IDocument mappedDocument;
                    if(tempWords.TryGetValue(map.WordID, out mappedWord) &&
                        tempDocuments.TryGetValue(map.DocumentID, out mappedDocument)) {

                        mappedWord.AddOccurrence(mappedDocument, map.FirstCharIndex, map.WordIndex,
                            WordLocation.GetInstance(map.Location));
                    }
                }
            }
        }

        /// <summary>
        /// Stores a document in the index.
        /// </summary>
        /// <param name="document">The document.</param>
        /// <param name="keywords">The document keywords, if any, an empty array or <c>null</c> otherwise.</param>
        /// <param name="content">The content of the document.</param>
        /// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
        /// <returns>The number of indexed words (including duplicates) in the document content, title and keywords,
        /// or <b>0</b> when the storer did not return a document ID.</returns>
        /// <remarks>Indexing the content of the document is <b>O(n)</b>,
        /// where <b>n</b> is the total number of words in the document.
        /// If the specified document was already in the index, all the old occurrences
        /// are deleted from the index.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="document"/> or <paramref name="content"/> are <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">If the storer result contains no ID for a newly-added word.</exception>
        public int StoreDocument(IDocument document, string[] keywords, string content, object state) {
            if(document == null) throw new ArgumentNullException("document");
            if(keywords == null) keywords = new string[0];
            if(content == null) throw new ArgumentNullException("content");

            lock(this) {
                DumpedChange removeChange = RemoveDocumentInternal(document);

                if(removeChange != null) {
                    OnIndexChange(document, IndexChangeType.DocumentRemoved, removeChange, state);
                }
            }

            keywords = Tools.CleanupKeywords(keywords);

            // When the IndexStorer handles the IndexChanged event and a document is added, the storer generates a new ID and returns it
            // via the event handler, then the in-memory index is updated (the document instance is shared across all words) - the final ID
            // is generated by the actual IndexStorer implementation (SaveData properly populates the Result field in the args)

            List<DumpedWord> dw = new List<DumpedWord>(content.Length / 5);
            List<DumpedWordMapping> dm = new List<DumpedWordMapping>(content.Length / 5);
            List<Word> newWords = new List<Word>(50);
            Word tempWord = null;
            DumpedWord tempDumpedWord = null;
            DumpedWordMapping mapping = null;

            int count = 0;
            // Temporary IDs for new words, handed out downwards from uint.MaxValue
            uint sequentialWordId = uint.MaxValue;

            // Single snapshot taken through the locking property, used for both content and title
            string[] currentStopWords = StopWords;

            // Store content words
            WordInfo[] words = document.Tokenize(content);
            words = Tools.RemoveStopWords(words, currentStopWords);

            foreach(WordInfo info in words) {
                mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Content, out tempWord, out tempDumpedWord);
                dm.Add(mapping);
                TrackNewWord(tempWord, tempDumpedWord, mapping, dw, newWords, ref sequentialWordId);
            }
            count += words.Length;

            // Store title words
            words = document.Tokenize(document.Title);
            words = Tools.RemoveStopWords(words, currentStopWords);

            foreach(WordInfo info in words) {
                mapping = StoreWord(info.Text, document, info.FirstCharIndex, info.WordIndex, WordLocation.Title, out tempWord, out tempDumpedWord);
                dm.Add(mapping);
                TrackNewWord(tempWord, tempDumpedWord, mapping, dw, newWords, ref sequentialWordId);
            }
            count += words.Length;

            // Store keywords: the first-char index of each keyword is the total length
            // of the preceding keywords plus one extra character per keyword
            ushort tempCount = 0;
            for(ushort i = 0; i < (ushort)keywords.Length; i++) {
                mapping = StoreWord(keywords[i], document, tempCount, i, WordLocation.Keywords, out tempWord, out tempDumpedWord);
                dm.Add(mapping);
                TrackNewWord(tempWord, tempDumpedWord, mapping, dw, newWords, ref sequentialWordId);
                tempCount += (ushort)(1 + keywords[i].Length);
            }
            count += keywords.Length;

            IndexStorerResult result = OnIndexChange(document, IndexChangeType.DocumentAdded,
                new DumpedChange(new DumpedDocument(document), dw, dm), state);

            // Update document ID
            if(result == null || !result.DocumentID.HasValue) {
                // HACK: result is null -> index is corrupted, silently return
                return 0;
            }
            document.ID = result.DocumentID.Value;

            // Update word IDs in newWords
            if(newWords.Count > 0) {
                // Index the returned IDs by word text once (first entry wins, like a linear scan
                // that stops at the first match) instead of rescanning the list for every new word
                Dictionary<string, WordId> idsByText = new Dictionary<string, WordId>();
                foreach(WordId id in result.WordIDs) {
                    if(id.Text != null && !idsByText.ContainsKey(id.Text)) idsByText.Add(id.Text, id);
                }

                foreach(Word word in newWords) {
                    WordId assigned;
                    if(!idsByText.TryGetValue(word.Text, out assigned)) throw new InvalidOperationException("No ID for new word");
                    word.ID = assigned.ID;
                }
            }

            return count;
        }

        /// <summary>
        /// Registers a word that <see cref="StoreWord" /> has just created: assigns it the next temporary ID
        /// (also on its dumped data and on the mapping) and queues it for the final ID update.
        /// </summary>
        /// <param name="newWord">The new word, or <c>null</c> if the word already existed.</param>
        /// <param name="dumpedWord">The dumped word data, or <c>null</c> if the word already existed.</param>
        /// <param name="mapping">The mapping returned by <see cref="StoreWord" /> for this occurrence.</param>
        /// <param name="dumpedWords">The list collecting the dumped data of new words.</param>
        /// <param name="newWords">The list collecting the new words.</param>
        /// <param name="nextTemporaryId">The next temporary ID; decremented when a word is registered.</param>
        private static void TrackNewWord(Word newWord, DumpedWord dumpedWord, DumpedWordMapping mapping,
            List<DumpedWord> dumpedWords, List<Word> newWords, ref uint nextTemporaryId) {

            if(newWord == null || dumpedWord == null) return;

            mapping.WordID = nextTemporaryId;
            dumpedWord.ID = nextTemporaryId;
            dumpedWords.Add(dumpedWord);
            newWord.ID = nextTemporaryId;
            newWords.Add(newWord);
            nextTemporaryId--;
        }

        /// <summary>
        /// Stores a word in the catalog.
        /// </summary>
        /// <param name="wordText">The word to store (it is converted to lowercase, invariant culture).</param>
        /// <param name="document">The document the word occurs in.</param>
        /// <param name="firstCharIndex">The index of the first character of the word in the document the word occurs at.</param>
        /// <param name="wordIndex">The index of the word in the document.</param>
        /// <param name="location">The location of the word.</param>
        /// <param name="newWord">The new word, or <c>null</c> if the word was already in the catalog.</param>
        /// <param name="dumpedWord">The dumped word data, or <c>null</c> if the word was already in the catalog.</param>
        /// <returns>The dumped word mapping data.</returns>
        /// <remarks>The word is looked up in, and if necessary added to, the catalog dictionary
        /// while holding the index lock.</remarks>
        protected DumpedWordMapping StoreWord(string wordText, IDocument document, ushort firstCharIndex, ushort wordIndex,
            WordLocation location, out Word newWord, out DumpedWord dumpedWord) {

            wordText = wordText.ToLower(CultureInfo.InvariantCulture);

            lock(this) {
                Word word = null;

                if(!catalog.TryGetValue(wordText, out word)) {
                    // Use ZERO as initial ID, update when IndexStorer has stored the word
                    // A reference to this newly-created word must be passed outside this method
                    word = new Word(0, wordText);
                    catalog.Add(wordText, word);
                    newWord = word;
                    dumpedWord = new DumpedWord(word);
                }
                else {
                    newWord = null;
                    dumpedWord = null;
                }

                word.AddOccurrence(document, firstCharIndex, wordIndex, location);
                return new DumpedWordMapping(word.ID, document.ID, firstCharIndex, wordIndex, location.Location);
            }
        }

        /// <summary>
        /// Removes a document from the index.
        /// </summary>
        /// <param name="document">The document to remove.</param>
        /// <param name="state">A state object that is passed to the IndexStorer SaveData/DeleteData function.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="document"/> is <c>null</c>.</exception>
        public void RemoveDocument(IDocument document, object state) {
            if(document == null) throw new ArgumentNullException("document");

            // The removal walks and modifies the catalog, so it must run under the index lock,
            // exactly like the removal step performed by StoreDocument
            lock(this) {
                DumpedChange dc = RemoveDocumentInternal(document);

                if(dc != null) {
                    OnIndexChange(document, IndexChangeType.DocumentRemoved, dc, state);
                }
                // else nothing to do
            }
        }

        /// <summary>
        /// Finds a document with a specified name (ordinal, case-insensitive comparison).
        /// </summary>
        /// <param name="name">The name of the document.</param>
        /// <returns>The document or <c>null</c>.</returns>
        /// <remarks>The method does not lock: callers are expected to hold the index lock.</remarks>
        private IDocument FindDocument(string name) {
            foreach(KeyValuePair<string, Word> pair in catalog) {
                foreach(var occurrence in pair.Value.Occurrences) {
                    if(StringComparer.OrdinalIgnoreCase.Compare(occurrence.Key.Name, name) == 0) {
                        return occurrence.Key;
                    }
                }
            }
            return null;
        }

        /// <summary>
        /// Removes a document from the index and generates the dumped change data.
        /// </summary>
        /// <param name="document">The document to remove (matched by name).</param>
        /// <returns>The dumped change data, or <c>null</c> if no document with that name is in the index.</returns>
        /// <remarks>The method does not lock: callers are expected to hold the index lock.</remarks>
        /// <exception cref="ArgumentNullException">If <paramref name="document"/> is <c>null</c>.</exception>
        protected DumpedChange RemoveDocumentInternal(IDocument document) {
            if(document == null) throw new ArgumentNullException("document");

            // Find real document to remove by name
            document = FindDocument(document.Name);
            if(document == null) return null;

            // Remove the document's occurrences from every word (the words are modified,
            // the catalog itself is not, so enumerating it here is safe)
            List<DumpedWordMapping> dm = new List<DumpedWordMapping>(1500);
            foreach(KeyValuePair<string, Word> pair in catalog) {
                dm.AddRange(pair.Value.RemoveOccurrences(document));
            }

            // Remove all words that have no occurrences left
            List<KeyValuePair<string, Word>> toRemove = new List<KeyValuePair<string, Word>>(50);
            foreach(KeyValuePair<string, Word> pair in catalog) {
                if(pair.Value.TotalOccurrences == 0) toRemove.Add(pair);
            }

            List<DumpedWord> dw = new List<DumpedWord>(toRemove.Count);
            foreach(KeyValuePair<string, Word> pair in toRemove) {
                dw.Add(new DumpedWord(pair.Value));
                catalog.Remove(pair.Key);
            }

            // The document was found, so there is always a change to report
            return new DumpedChange(new DumpedDocument(document), dw, dm);
        }

        /// <summary>
        /// Performs a search in the index.
        /// </summary>
        /// <param name="parameters">The search parameters.</param>
        /// <returns>The results.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="parameters"/> is <c>null</c>.</exception>
        public SearchResultCollection Search(SearchParameters parameters) {
            if(parameters == null) throw new ArgumentNullException("parameters");

            // InitializeData replaces the catalog instance, so read the reference under the lock
            Dictionary<string, Word> currentCatalog;
            lock(this) {
                currentCatalog = catalog;
            }

            // NOTE(review): the fetcher synchronizes on the catalog instance while the index
            // synchronizes on itself, so word lookups are not mutually exclusive with writers
            using(IWordFetcher fetcher = new InMemoryIndexWordFetcher(currentCatalog)) {
                if(parameters.DocumentTypeTags == null) {
                    return Tools.SearchInternal(parameters.Query, null, false, parameters.Options, fetcher);
                }
                else {
                    return Tools.SearchInternal(parameters.Query, parameters.DocumentTypeTags, true, parameters.Options, fetcher);
                }
            }
        }

    }

    /// <summary>
    /// Implements a word fetcher for use with the in-memory index.
    /// </summary>
    public class InMemoryIndexWordFetcher : IWordFetcher {

        private Dictionary<string, Word> catalog;

        /// <summary>
        /// Initializes a new instance of the <see cref="InMemoryIndexWordFetcher" /> class.
        /// </summary>
        /// <param name="catalog">The index catalog.</param>
        /// <exception cref="ArgumentNullException">If <paramref name="catalog"/> is <c>null</c>.</exception>
        public InMemoryIndexWordFetcher(Dictionary<string, Word> catalog) {
            if(catalog == null) throw new ArgumentNullException("catalog");

            this.catalog = catalog;
        }

        /// <summary>
        /// Tries to get a word.
        /// </summary>
        /// <param name="text">The text of the word.</param>
        /// <param name="word">The found word, if any, <c>null</c> otherwise.</param>
        /// <returns><c>true</c> if the word is found, <c>false</c> otherwise.</returns>
        /// <remarks>The lookup locks the catalog instance passed to the constructor.</remarks>
        public bool TryGetWord(string text, out Word word) {
            lock(catalog) {
                return catalog.TryGetValue(text, out word);
            }
        }

        #region IDisposable Members

        /// <summary>
        /// Performs application-defined tasks associated with freeing, releasing, or resetting unmanaged resources.
        /// </summary>
        public void Dispose() {
            // Nothing to do
        }

        #endregion

    }

}