using System;
using System.Collections.Generic;
using System.Text;
using System.Globalization;

namespace ScrewTurn.Wiki.SearchEngine {

	/// <summary>
	/// Implements useful methods.
	/// </summary>
	public static class Tools {

		/// <summary>
		/// Exclusive upper bound for the start index of a token extracted by <see cref="Tokenize(string, WordLocation)"/>.
		/// Word start indices are stored as <c>ushort</c>, so scanning stops before that type's range is exhausted.
		/// </summary>
		private const int MaxWordStartIndex = 65500;

		#region Main Search Algorithms

		/// <summary>
		/// Performs a search in the index.
		/// </summary>
		/// <param name="query">The search query.</param>
		/// <param name="documentTypeTags">The document type tags to include in the search.</param>
		/// <param name="filterDocumentType"><c>true</c> to apply the filter on the document type.</param>
		/// <param name="options">The search options.</param>
		/// <param name="fetcher">An object that is able to fetch words.</param>
		/// <returns>The results.</returns>
		/// <exception cref="ArgumentNullException">If <paramref name="query"/> or <paramref name="fetcher"/> are <c>null</c>.</exception>
		/// <exception cref="ArgumentException">If <paramref name="query"/> is empty.</exception>
		/// <exception cref="ArgumentNullException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is <c>null</c>.</exception>
		/// <exception cref="ArgumentException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is empty.</exception>
		/// <exception cref="InvalidOperationException">If <paramref name="options"/> is not one of the supported values.</exception>
		public static SearchResultCollection SearchInternal(string query, string[] documentTypeTags, bool filterDocumentType, SearchOptions options, IWordFetcher fetcher) {
			if(query == null) throw new ArgumentNullException("query");
			if(query.Length == 0) throw new ArgumentException("Query cannot be empty", "query");

			if(filterDocumentType && documentTypeTags == null) throw new ArgumentNullException("documentTypeTags");
			if(filterDocumentType && documentTypeTags.Length == 0) throw new ArgumentException("documentTypeTags cannot be empty", "documentTypeTags");

			if(fetcher == null) throw new ArgumentNullException("fetcher");

			SearchResultCollection results = new SearchResultCollection();

			query = query.ToLowerInvariant();
			string[] queryWords = query.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

			// Sum of the relevance of every occurrence found; used at the end to finalize each result's relevance
			float totalRelevance = 0;

			Word word = null;
			foreach(string q in queryWords) {
				if(!fetcher.TryGetWord(q, out word)) continue;

				foreach(IDocument doc in word.Occurrences.Keys) {
					// Skip documents with excluded tags
					if(filterDocumentType && !IsDocumentTypeTagIncluded(doc.TypeTag, documentTypeTags)) continue;

					foreach(BasicWordInfo info in word.Occurrences[doc]) {
						// If a search result is already present, add a new match to it,
						// otherwise create a new search result object
						WordInfo mi = new WordInfo(q, info.FirstCharIndex, info.WordIndex, info.Location);
						SearchResult res = results.GetSearchResult(doc);
						if(res == null) {
							res = new SearchResult(doc);
							res.Relevance.SetValue(info.Location.RelativeRelevance);
							res.Matches.Add(mi);
							results.Add(res);
						}
						else {
							// Avoid adding duplicate matches (happens when query contains the same word multiple times)
							if(!res.Matches.ContainsOccurrence(mi.Text, mi.FirstCharIndex)) {
								res.Matches.Add(mi);
							}
							// NOTE(review): relevance is accumulated even when the match is a duplicate - confirm this is intended
							res.Relevance.SetValue(res.Relevance.Value + info.Location.RelativeRelevance);
						}
						totalRelevance += info.Location.RelativeRelevance;
					}
				}
			}

			// Discard the results that do not satisfy the requested options,
			// subtracting their relevance from the total
			if(options == SearchOptions.AllWords) {
				totalRelevance -= PurgeResultsForAllWords(results, queryWords);
			}
			else if(options == SearchOptions.ExactPhrase) {
				totalRelevance -= PurgeResultsForExactPhrase(results, queryWords);
			}
			else if(options == SearchOptions.AtLeastOneWord) {
				// Nothing to do
			}
			else throw new InvalidOperationException("Unsupported SearchOptions");

			// Finalize relevance values
			for(int i = 0; i < results.Count; i++) {
				results[i].Relevance.Finalize(totalRelevance);
			}

			return results;
		}

		/// <summary>
		/// Purges the invalid results when SearchOptions is AllWords.
		/// </summary>
		/// <param name="results">The results to purge.</param>
		/// <param name="queryWords">The query words.</param>
		/// <returns>The relevance value of the removed matches.</returns>
		public static float PurgeResultsForAllWords(SearchResultCollection results, string[] queryWords) {
			// Remove results that do not contain all the searched words
			float relevanceToRemove = 0;

			// Removal is deferred so that the collection is not modified while it is enumerated
			List<SearchResult> toRemove = new List<SearchResult>();

			foreach(SearchResult r in results) {
				// Shortcut: fewer matches than query words
				// NOTE(review): if the query repeats a word, a result with a single occurrence of it
				// is discarded here because duplicate matches are not stored - confirm this is intended
				if(r.Matches.Count < queryWords.Length) {
					toRemove.Add(r);
					continue;
				}

				foreach(string w in queryWords) {
					if(!r.Matches.Contains(w)) {
						toRemove.Add(r);
						break;
					}
				}
			}

			foreach(SearchResult r in toRemove) {
				results.Remove(r);
				relevanceToRemove += r.Relevance.Value;
			}

			return relevanceToRemove;
		}

		/// <summary>
		/// Purges the invalid results when SearchOptions is ExactPhrase.
		/// </summary>
		/// <param name="results">The results to purge.</param>
		/// <param name="queryWords">The query words.</param>
		/// <returns>The relevance value of the removed matches.</returns>
		public static float PurgeResultsForExactPhrase(SearchResultCollection results, string[] queryWords) {
			// Remove results that do not contain the exact phrase
			float relevanceToRemove = 0;

			// Removal is deferred so that the collection is not modified while it is enumerated
			List<SearchResult> toRemove = new List<SearchResult>();

			foreach(SearchResult r in results) {
				// Shortcut
				if(r.Matches.Count < queryWords.Length) {
					toRemove.Add(r);
					continue;
				}

				// Verify that all matches are in the same order as in the query
				// and that their indices make up contiguous words,
				// re-iterating from every word in the result, for example:
				// query = 'repeated content', result = 'content repeated content'
				// result must be tested with 'content repeated' (failing) and with 'repeated content' (succeeding)

				// Never negative: results with fewer matches than query words were handled above
				int maxTestShift = r.Matches.Count - queryWords.Length;

				bool sequenceFound = false;

				for(int shift = 0; shift <= maxTestShift && !sequenceFound; shift++) {
					int firstWordIndex = r.Matches[shift].WordIndex;
					bool allOk = true;

					for(int i = 0; i < queryWords.Length; i++) {
						if(queryWords[i] != r.Matches[i + shift].Text.ToLowerInvariant() ||
							r.Matches[i + shift].WordIndex != firstWordIndex + i) {
							allOk = false;
							break;
						}
					}

					sequenceFound = allOk;
				}

				if(!sequenceFound) {
					toRemove.Add(r);
				}
			}

			foreach(SearchResult r in toRemove) {
				results.Remove(r);
				relevanceToRemove += r.Relevance.Value;
			}

			return relevanceToRemove;
		}

		/// <summary>
		/// Determines whether a document tag is contained in a tag array.
		/// </summary>
		/// <param name="currentTag">The tag to check for.</param>
		/// <param name="includedTags">The tag array.</param>
		/// <returns><c>true</c> if <b>currentTag</b> is contained in <b>includedTags</b>, <c>false</c> otherwise.</returns>
		/// <remarks>The comparison is case-insensitive.</remarks>
		public static bool IsDocumentTypeTagIncluded(string currentTag, string[] includedTags) {
			currentTag = currentTag.ToLowerInvariant();
			foreach(string s in includedTags) {
				if(s.ToLowerInvariant() == currentTag) return true;
			}
			return false;
		}

		#endregion

		/// <summary>
		/// Cleans up keywords from invalid characters.
		/// </summary>
		/// <param name="keywords">The keywords to cleanup.</param>
		/// <returns>The clean keywords. Keywords that are empty after the cleanup are dropped;
		/// a <c>null</c> or empty input array is returned as-is.</returns>
		public static string[] CleanupKeywords(string[] keywords) {
			if(keywords == null || keywords.Length == 0) return keywords;

			List<string> result = new List<string>(keywords.Length);
			foreach(string k in keywords) {
				// Spaces are removed so that each keyword is handled as a single word
				string temp = RemoveDiacriticsAndPunctuation(k.Replace(" ", ""), true);
				if(temp.Length > 0) {
					result.Add(temp);
				}
			}
			return result.ToArray();
		}

		/// <summary>
		/// Removes "accents" and punctuation from a string, transforming it to lowercase (culture invariant).
		/// </summary>
		/// <param name="input">The input string.</param>
		/// <param name="isSingleWord">A value indicating whether the input string is a single word.
		/// If <c>true</c>, characters that are neither letters nor digits are dropped;
		/// if <c>false</c>, each run of such characters is replaced with a single space.</param>
		/// <returns>The normalized string (lowercase, culture invariant). Can be empty.</returns>
		public static string RemoveDiacriticsAndPunctuation(string input, bool isSingleWord) {
			// Code partially borrowed from:
			// http://weblogs.asp.net/fmarguerie/archive/2006/10/30/removing-diacritics-accents-from-strings.aspx

			// Decomposition separates base characters from their combining marks, so the marks can be skipped
			string normalizedString = input.Normalize(NormalizationForm.FormD);
			StringBuilder stringBuilder = new StringBuilder(input.Length);

			for(int i = 0; i < normalizedString.Length; i++) {
				char c = normalizedString[i];
				if(char.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark) continue;

				if(char.IsLetterOrDigit(c)) {
					stringBuilder.Append(c);
				}
				else if(!isSingleWord) {
					// Collapse runs of separators while appending: never emit two consecutive spaces.
					// Leading spaces are not emitted at all, they would be trimmed below anyway
					if(stringBuilder.Length > 0 && stringBuilder[stringBuilder.Length - 1] != ' ') {
						stringBuilder.Append(' ');
					}
				}
			}

			return stringBuilder.ToString().ToLowerInvariant().Trim(' ', '\'', '"');
		}

		/// <summary>
		/// Determines whether a char is a split char.
		/// </summary>
		/// <param name="current">The current char.</param>
		/// <returns><c>true</c> if the char is a split char, <c>false</c> otherwise.</returns>
		public static bool IsSplitChar(char current) {
			// http://msdn.microsoft.com/en-us/library/system.globalization.unicodecategory.aspx
			// A split char is anything but the following categories
			switch(char.GetUnicodeCategory(current)) {
				case UnicodeCategory.UppercaseLetter:
				case UnicodeCategory.LowercaseLetter:
				case UnicodeCategory.TitlecaseLetter:
				case UnicodeCategory.ModifierLetter:
				case UnicodeCategory.OtherLetter:
				case UnicodeCategory.NonSpacingMark:
				case UnicodeCategory.DecimalDigitNumber:
				case UnicodeCategory.LetterNumber:
				case UnicodeCategory.OtherNumber:
				case UnicodeCategory.CurrencySymbol:
					return false;
				default:
					return true;
			}
		}

		/// <summary>
		/// Computes the index of the first non-split char given a start index.
		/// </summary>
		/// <param name="startIndex">The start index.</param>
		/// <param name="content">The content.</param>
		/// <returns>The index of the first non-split char at or after <paramref name="startIndex"/>,
		/// or the content length if there is none. The value saturates at <see cref="ushort.MaxValue"/>
		/// instead of wrapping around when the content is longer than that.</returns>
		/// <exception cref="ArgumentNullException">If <paramref name="content"/> is <c>null</c>.</exception>
		public static ushort SkipSplitChars(ushort startIndex, string content) {
			if(content == null) throw new ArgumentNullException("content");

			int currentIndex = startIndex;
			while(currentIndex < content.Length && IsSplitChar(content[currentIndex])) currentIndex++;

			// A plain cast would silently wrap for indices above 65535 and report a position
			// located before the start index
			return (ushort)Math.Min(currentIndex, ushort.MaxValue);
		}

		/// <summary>
		/// Tokenizes a string.
		/// </summary>
		/// <param name="text">The text to tokenize.</param>
		/// <param name="location">The location of the words that are extracted.</param>
		/// <returns>The tokens. Only words starting before index 65500 are extracted.</returns>
		/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
		public static WordInfo[] Tokenize(string text, WordLocation location) {
			if(text == null) throw new ArgumentNullException("text");

			List<WordInfo> words = new List<WordInfo>(text.Length / 5); // Average 5 chars/word

			// Indices are kept as int while scanning: a ushort counter wraps around
			// when the text is longer than 65535 chars
			int currentIndex = 0;

			// Skip all leading split chars
			while(currentIndex < text.Length && IsSplitChar(text[currentIndex])) currentIndex++;

			while(currentIndex < text.Length && currentIndex < MaxWordStartIndex) {
				int currentWordStart = currentIndex;

				// Advance to the end of the current word
				while(currentIndex < text.Length && !IsSplitChar(text[currentIndex])) currentIndex++;

				string w = text.Substring(currentWordStart, currentIndex - currentWordStart);
				w = RemoveDiacriticsAndPunctuation(w, true);

				if(!string.IsNullOrEmpty(w)) {
					// Both casts are safe: currentWordStart < MaxWordStartIndex and there cannot be
					// more words than chars scanned
					words.Add(new WordInfo(w, (ushort)currentWordStart, (ushort)words.Count, location));
				}

				// Step over the split char that terminated the word, then over any following split chars
				currentIndex++;
				while(currentIndex < text.Length && IsSplitChar(text[currentIndex])) currentIndex++;
			}

			return words.ToArray();
		}

		/// <summary>
		/// Tokenizes a string, using <b>WordLocation.Content</b> as the location of the extracted words.
		/// </summary>
		/// <param name="text">The text to tokenize.</param>
		/// <returns>The tokens.</returns>
		/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
		public static WordInfo[] Tokenize(string text) {
			return Tokenize(text, WordLocation.Content);
		}

		/// <summary>
		/// Removes stop words from a set of words (case insensitive).
		/// </summary>
		/// <param name="words">The input words.</param>
		/// <param name="stopWords">The array of stop words.</param>
		/// <returns>The input words without the stop words.</returns>
		/// <exception cref="ArgumentNullException">If <paramref name="words"/> or <paramref name="stopWords"/> are <c>null</c>.</exception>
		public static WordInfo[] RemoveStopWords(WordInfo[] words, string[] stopWords) {
			if(words == null) throw new ArgumentNullException("words");
			if(stopWords == null) throw new ArgumentNullException("stopWords");

			List<WordInfo> result = new List<WordInfo>(words.Length);

			foreach(WordInfo current in words) {
				bool found = false;
				foreach(string sw in stopWords) {
					if(string.Compare(current.Text, sw, true, CultureInfo.InvariantCulture) == 0) {
						found = true;
						break;
					}
				}
				if(!found) result.Add(current);
			}

			return result.ToArray();
		}

	}

	/// <summary>
	/// Defines the interface for a component that fetches words.
	/// </summary>
	public interface IWordFetcher : IDisposable {

		/// <summary>
		/// Tries to get a word.
		/// </summary>
		/// <param name="text">The text of the word.</param>
		/// <param name="word">The found word, if any, <c>null</c> otherwise.</param>
		/// <returns><c>true</c> if the word is found, <c>false</c> otherwise.</returns>
		bool TryGetWord(string text, out Word word);

	}

}