using System;
using System.Collections.Generic;
using System.Text;
using System.Globalization;

namespace ScrewTurn.Wiki.SearchEngine {

	/// <summary>
	/// Represents a word in a document.
	/// </summary>
	/// <remarks>All instance and static members are thread-safe.</remarks>
	public class Word {

		/// <summary>
		/// The word text, lowercase.
		/// </summary>
		protected string text;

		/// <summary>
		/// The occurrences.
		/// </summary>
		protected OccurrenceDictionary occurrences;

		/// <summary>
		/// The word unique ID.
		/// </summary>
		protected uint id;

		/// <summary>
		/// Private lock guarding <see cref="id"/>. A dedicated object is used instead of
		/// <c>this</c> so that code outside this class cannot take the same lock and
		/// interfere with (or deadlock against) the <see cref="ID"/> accessors.
		/// </summary>
		private readonly object idLock = new object();

		/// <summary>
		/// Initializes a new instance of the <see cref="T:Word" /> class.
		/// </summary>
		/// <param name="id">The word ID.</param>
		/// <param name="text">The text of the word (lowercase).</param>
		/// <remarks>The occurrences are stored in a new <see cref="OccurrenceDictionary"/> constructed
		/// with the argument <c>10</c> (presumably an initial capacity; confirm against that type).</remarks>
		/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
		/// <exception cref="ArgumentException">If <paramref name="text"/> is empty.</exception>
		public Word(uint id, string text)
			: this(id, text, new OccurrenceDictionary(10)) { }

		/// <summary>
		/// Initializes a new instance of the <see cref="T:Word" /> class.
		/// </summary>
		/// <param name="id">The word ID.</param>
		/// <param name="text">The text of the word (lowercase).</param>
		/// <param name="occurrences">The occurrences.</param>
		/// <exception cref="ArgumentNullException">If <paramref name="text"/> or <paramref name="occurrences"/> are <c>null</c>.</exception>
		/// <exception cref="ArgumentException">If <paramref name="text"/> is empty.</exception>
		public Word(uint id, string text, OccurrenceDictionary occurrences) {
			if(text == null) throw new ArgumentNullException("text");
			if(text.Length == 0) throw new ArgumentException("Text must contain at least one character", "text");
			if(occurrences == null) throw new ArgumentNullException("occurrences");

			// The stored text is whatever Tools.RemoveDiacriticsAndPunctuation returns for the input.
			// NOTE(review): the result is not re-validated here, so it may presumably be empty if the
			// helper strips every character; confirm whether that case needs to be rejected.
			this.text = Tools.RemoveDiacriticsAndPunctuation(text, true);
			this.id = id;
			this.occurrences = occurrences;
		}

		/// <summary>
		/// Gets or sets the unique ID of the word.
		/// </summary>
		public uint ID {
			get {
				lock(idLock) {
					return id;
				}
			}
			set {
				lock(idLock) {
					id = value;
				}
			}
		}

		/// <summary>
		/// Gets the text of the word (lowercase).
		/// </summary>
		public string Text {
			get {
				// Read-only: no need to lock
				return text;
			}
		}

		/// <summary>
		/// Gets the occurrences.
		/// </summary>
		/// <remarks>The lock is released before the reference is returned, so it does not protect
		/// the caller's subsequent use of the dictionary.</remarks>
		public OccurrenceDictionary Occurrences {
			get {
				lock(occurrences) {
					return occurrences;
				}
			}
		}

		/// <summary>
		/// Gets the total occurrences.
		/// </summary>
		/// <remarks>Computing the result is O(n), where <b>n</b> is the number of
		/// documents the word occurs in at least one time.</remarks>
		public int TotalOccurrences {
			get {
				int count = 0;
				lock(occurrences) {
					// Sum the number of stored positions over every document entry
					foreach(KeyValuePair<IDocument, SortedBasicWordInfoSet> pair in occurrences) {
						count += pair.Value.Count;
					}
				}
				return count;
			}
		}

		/// <summary>
		/// Stores an occurrence.
		/// </summary>
		/// <param name="document">The document the occurrence is referred to.</param>
		/// <param name="firstCharIndex">The index of the first character of the word in the document.</param>
		/// <param name="wordIndex">The index of the word in the document.</param>
		/// <param name="location">The location of the word.</param>
		/// <remarks>Adding an occurrence is O(n), where <b>n</b> is the number of occurrences
		/// of the word already stored for the same document. If there were no occurrences previously stored,
		/// the operation is O(1).
		/// <paramref name="firstCharIndex"/> and <paramref name="wordIndex"/> are unsigned, so they can
		/// never be negative and need no range validation.</remarks>
		/// <exception cref="ArgumentNullException">If <paramref name="document"/> is <c>null</c>.</exception>
		public void AddOccurrence(IDocument document, ushort firstCharIndex, ushort wordIndex, WordLocation location) {
			if(document == null) throw new ArgumentNullException("document");

			lock(occurrences) {
				BasicWordInfo info = new BasicWordInfo(firstCharIndex, wordIndex, location);

				if(occurrences.ContainsKey(document)) {
					// Existing document: append to its set
					occurrences[document].Add(info);
				}
				else {
					// New document: create its set with this first occurrence
					SortedBasicWordInfoSet set = new SortedBasicWordInfoSet();
					set.Add(info);
					occurrences.Add(document, set);
				}
			}
		}

		/// <summary>
		/// Removes all the occurrences for a document.
		/// </summary>
		/// <param name="document">The document to remove the occurrences of.</param>
		/// <returns>The dumped word mappings for the removed word occurrences
		/// (an empty list if the document has no entry).</returns>
		/// <remarks>Removing the occurrences for the document is O(1).</remarks>
		/// <exception cref="ArgumentNullException">If <paramref name="document"/> is <c>null</c>.</exception>
		// NOTE(review): the list element type is assumed to match the return type of
		// OccurrenceDictionary.RemoveExtended; confirm against that declaration.
		public List<DumpedWordMapping> RemoveOccurrences(IDocument document) {
			if(document == null) throw new ArgumentNullException("document");

			lock(occurrences) {
				if(occurrences.ContainsKey(document)) return occurrences.RemoveExtended(document, ID);
				else return new List<DumpedWordMapping>();
			}
		}

		/// <summary>
		/// Adds a bulk of occurrences of the word in a document, removing all the old positions, if any.
		/// </summary>
		/// <param name="document">The document.</param>
		/// <param name="positions">The positions.</param>
		/// <remarks>If <paramref name="positions"/> is empty and the document already has an entry, the effect
		/// of the invocation of the method is equal to that of <see cref="M:RemoveOccurrences" /> with the
		/// same document. If the document has no entry yet, <paramref name="positions"/> is stored as-is,
		/// even when empty.
		/// Bulk-adding the occurrences is O(1).</remarks>
		/// <exception cref="ArgumentNullException">If <paramref name="document"/> or <paramref name="positions"/> are <c>null</c>.</exception>
		public void BulkAddOccurrences(IDocument document, SortedBasicWordInfoSet positions) {
			if(document == null) throw new ArgumentNullException("document");
			if(positions == null) throw new ArgumentNullException("positions");

			lock(occurrences) {
				if(occurrences.ContainsKey(document)) {
					// Replace the stored positions; an empty set means "drop the entry"
					// (the list returned by RemoveOccurrences is intentionally discarded)
					if(positions.Count == 0) RemoveOccurrences(document);
					else occurrences[document] = positions;
				}
				else {
					// NOTE(review): an empty positions set is stored here too, leaving an entry with
					// zero positions instead of behaving like RemoveOccurrences; confirm this is intended.
					occurrences.Add(document, positions);
				}
			}
		}

		/// <summary>
		/// Gets a string representation of the current instance.
		/// </summary>
		/// <returns>The string representation, in the form <c>text [xN]</c> where <b>N</b> is
		/// <see cref="TotalOccurrences"/>.</returns>
		public override string ToString() {
			return string.Format("{0} [x{1}]", text, TotalOccurrences);
		}

	}

}