using System;
using System.Collections.Generic;
using System.Text;
using System.Globalization;
namespace ScrewTurn.Wiki.SearchEngine {
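/// <summary>
/// Implements utility methods for the search engine: the core search algorithms,
/// tokenization and text normalization.
/// </summary>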
public static class Tools {
#region Main Search Algorithms
/// <summary>
/// Performs a search in the index.
/// </summary>
/// <param name="query">The search query (words separated by spaces; matched in lowercase, culture invariant).</param>
/// <param name="documentTypeTags">The document type tags to include in the search.</param>
/// <param name="filterDocumentType"><c>true</c> to apply the filter on the document type.</param>
/// <param name="options">The search options.</param>
/// <param name="fetcher">An object that is able to fetch words.</param>
/// <returns>The results.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="query"/> or <paramref name="fetcher"/> are <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="query"/> is empty.</exception>
/// <exception cref="ArgumentNullException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="filterDocumentType"/> is <c>true</c> and <paramref name="documentTypeTags"/> is empty.</exception>
/// <exception cref="InvalidOperationException">If <paramref name="options"/> is not a supported value.</exception>
public static SearchResultCollection SearchInternal(string query, string[] documentTypeTags, bool filterDocumentType, SearchOptions options, IWordFetcher fetcher) {
    if(query == null) throw new ArgumentNullException("query");
    if(query.Length == 0) throw new ArgumentException("Query cannot be empty", "query");

    if(filterDocumentType && documentTypeTags == null) throw new ArgumentNullException("documentTypeTags");
    if(filterDocumentType && documentTypeTags.Length == 0) throw new ArgumentException("documentTypeTags cannot be empty", "documentTypeTags");

    if(fetcher == null) throw new ArgumentNullException("fetcher");

    // Fail fast: reject unsupported options before scanning the index
    if(options != SearchOptions.AllWords &&
        options != SearchOptions.ExactPhrase &&
        options != SearchOptions.AtLeastOneWord) {
        throw new InvalidOperationException("Unsupported SearchOptions");
    }

    SearchResultCollection results = new SearchResultCollection();

    query = query.ToLowerInvariant();
    string[] queryWords = query.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Sum of the relevance of all matches, used at the end to finalize each result's relevance
    float totalRelevance = 0;

    Word word = null;
    foreach(string q in queryWords) {
        if(!fetcher.TryGetWord(q, out word)) continue;

        foreach(IDocument doc in word.Occurrences.Keys) {
            // Skip documents with excluded tags
            if(filterDocumentType && !IsDocumentTypeTagIncluded(doc.TypeTag, documentTypeTags)) continue;

            // Look up the existing result once per document instead of once per occurrence;
            // it is created lazily on the first occurrence, so documents without occurrences
            // still produce no result
            SearchResult res = results.GetSearchResult(doc);

            foreach(BasicWordInfo info in word.Occurrences[doc]) {
                WordInfo mi = new WordInfo(q, info.FirstCharIndex, info.WordIndex, info.Location);

                if(res == null) {
                    res = new SearchResult(doc);
                    res.Relevance.SetValue(info.Location.RelativeRelevance);
                    res.Matches.Add(mi);
                    results.Add(res);
                }
                else {
                    // Avoid adding duplicate matches (happens when query contains the same word multiple times)
                    if(!res.Matches.ContainsOccurrence(mi.Text, mi.FirstCharIndex)) {
                        res.Matches.Add(mi);
                    }
                    // NOTE(review): relevance is accumulated even when the match is a duplicate,
                    // so a word repeated in the query weighs more - confirm this is intended
                    res.Relevance.SetValue(res.Relevance.Value + info.Location.RelativeRelevance);
                }

                totalRelevance += info.Location.RelativeRelevance;
            }
        }
    }

    // Drop the results that do not satisfy the requested options, together with their relevance
    if(options == SearchOptions.AllWords) {
        totalRelevance -= PurgeResultsForAllWords(results, queryWords);
    }
    else if(options == SearchOptions.ExactPhrase) {
        totalRelevance -= PurgeResultsForExactPhrase(results, queryWords);
    }
    // SearchOptions.AtLeastOneWord: nothing to purge

    // Finalize relevance values
    for(int i = 0; i < results.Count; i++) {
        results[i].Relevance.Finalize(totalRelevance);
    }

    return results;
}
/// <summary>
/// Purges the invalid results when SearchOptions is AllWords.
/// </summary>
/// <param name="results">The results to purge.</param>
/// <param name="queryWords">The query words.</param>
/// <returns>The relevance value of the removed matches.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="results"/> or <paramref name="queryWords"/> are <c>null</c>.</exception>
public static float PurgeResultsForAllWords(SearchResultCollection results, string[] queryWords) {
    if(results == null) throw new ArgumentNullException("results");
    if(queryWords == null) throw new ArgumentNullException("queryWords");

    // A query may repeat the same word, but a result never stores the same occurrence twice,
    // so the match count must be compared against the number of DISTINCT query words
    List<string> distinctWords = new List<string>(queryWords.Length);
    foreach(string w in queryWords) {
        if(!distinctWords.Contains(w)) distinctWords.Add(w);
    }

    // Collect results that do not contain all the searched words
    // (removal is deferred so that the collection is not modified while enumerating it)
    List<SearchResult> toRemove = new List<SearchResult>();
    foreach(SearchResult r in results) {
        // Shortcut: fewer matches than distinct words means at least one word is missing
        bool containsAllWords = r.Matches.Count >= distinctWords.Count;

        if(containsAllWords) {
            foreach(string w in distinctWords) {
                if(!r.Matches.Contains(w)) {
                    containsAllWords = false;
                    break;
                }
            }
        }

        if(!containsAllWords) toRemove.Add(r);
    }

    float relevanceToRemove = 0;
    foreach(SearchResult r in toRemove) {
        results.Remove(r);
        relevanceToRemove += r.Relevance.Value;
    }

    return relevanceToRemove;
}
/// <summary>
/// Purges the invalid results when SearchOptions is ExactPhrase.
/// </summary>
/// <param name="results">The results to purge.</param>
/// <param name="queryWords">The query words.</param>
/// <returns>The relevance value of the removed matches.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="results"/> or <paramref name="queryWords"/> are <c>null</c>.</exception>
public static float PurgeResultsForExactPhrase(SearchResultCollection results, string[] queryWords) {
    if(results == null) throw new ArgumentNullException("results");
    if(queryWords == null) throw new ArgumentNullException("queryWords");

    // Collect results that do not contain the exact phrase
    // (removal is deferred so that the collection is not modified while enumerating it)
    List<SearchResult> toRemove = new List<SearchResult>();

    foreach(SearchResult r in results) {
        // Shortcut: not enough matches to make up the phrase
        if(r.Matches.Count < queryWords.Length) {
            toRemove.Add(r);
            continue;
        }

        // Verify that all matches are in the same order as in the query
        // and that their indices make up contiguous words,
        // re-iterating from every word in the result, for example:
        // query = 'repeated content', result = 'content repeated content'
        // result must be tested with 'content repeated' (failing) and with 'repeated content' (succeeding)
        // NOTE(review): this relies on r.Matches being ordered by position in the document - confirm

        // Last start position at which the whole phrase still fits in the matches (never negative here)
        int maxTestShift = r.Matches.Count - queryWords.Length;

        bool sequenceFound = false;
        for(int shift = 0; shift <= maxTestShift && !sequenceFound; shift++) {
            int firstWordIndex = r.Matches[shift].WordIndex;

            bool allOk = true;
            for(int i = 0; i < queryWords.Length; i++) {
                if(queryWords[i] != r.Matches[i + shift].Text.ToLowerInvariant() ||
                    r.Matches[i + shift].WordIndex != firstWordIndex + i) {
                    allOk = false;
                    break;
                }
            }

            sequenceFound = allOk;
        }

        if(!sequenceFound) toRemove.Add(r);
    }

    float relevanceToRemove = 0;
    foreach(SearchResult r in toRemove) {
        results.Remove(r);
        relevanceToRemove += r.Relevance.Value;
    }

    return relevanceToRemove;
}
/// <summary>
/// Determines whether a document tag is contained in a tag array.
/// </summary>
/// <param name="currentTag">The tag to check for.</param>
/// <param name="includedTags">The tag array.</param>
/// <returns><c>true</c> if <b>currentTag</b> is contained in <b>includedTags</b>, <c>false</c> otherwise.</returns>
/// <remarks>The comparison is case-insensitive. <c>null</c> entries in <paramref name="includedTags"/> never match.</remarks>
/// <exception cref="ArgumentNullException">If <paramref name="currentTag"/> or <paramref name="includedTags"/> are <c>null</c>.</exception>
public static bool IsDocumentTypeTagIncluded(string currentTag, string[] includedTags) {
    if(currentTag == null) throw new ArgumentNullException("currentTag");
    if(includedTags == null) throw new ArgumentNullException("includedTags");

    foreach(string tag in includedTags) {
        // Case-insensitive comparison without allocating lowercase copies;
        // string.Equals tolerates null entries, which simply do not match
        if(string.Equals(tag, currentTag, StringComparison.OrdinalIgnoreCase)) return true;
    }

    return false;
}
#endregion
/// <summary>
/// Cleans up keywords from invalid characters.
/// </summary>
/// <param name="keywords">The keywords to cleanup.</param>
/// <returns>The clean keywords. Keywords that are <c>null</c> or become empty after the cleanup are dropped.
/// If <paramref name="keywords"/> is <c>null</c> or empty, it is returned as is.</returns>
public static string[] CleanupKeywords(string[] keywords) {
    if(keywords == null || keywords.Length == 0) return keywords;

    List<string> result = new List<string>(keywords.Length);
    foreach(string k in keywords) {
        // A null entry carries no keyword: skip it instead of failing on k.Replace
        if(k == null) continue;

        // Each keyword is treated as a single word, so inner spaces are removed first
        string temp = RemoveDiacriticsAndPunctuation(k.Replace(" ", ""), true);
        if(temp.Length > 0) {
            result.Add(temp);
        }
    }

    return result.ToArray();
}
/// <summary>
/// Removes "accents" and punctuation from a string, transforming it to lowercase (culture invariant).
/// </summary>
/// <param name="input">The input string.</param>
/// <param name="isSingleWord">A value indicating whether the input string is a single word.
/// When <c>true</c>, characters that are neither letters nor digits are removed;
/// when <c>false</c>, each run of such characters is replaced with a single space.</param>
/// <returns>The normalized string (lowercase, culture invariant). Can be empty.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="input"/> is <c>null</c>.</exception>
public static string RemoveDiacriticsAndPunctuation(string input, bool isSingleWord) {
    if(input == null) throw new ArgumentNullException("input");

    // Code partially borrowed from:
    // http://weblogs.asp.net/fmarguerie/archive/2006/10/30/removing-diacritics-accents-from-strings.aspx

    // Canonical decomposition splits accented letters into base letter + combining marks,
    // so that the marks can be dropped individually
    string normalizedString = input.Normalize(NormalizationForm.FormD);
    StringBuilder stringBuilder = new StringBuilder(normalizedString.Length);

    // Tracks whether the last appended char is a separator, so that consecutive separators
    // collapse into one space while appending (no repeated search-and-replace pass needed)
    bool lastWasSpace = false;

    for(int i = 0; i < normalizedString.Length; i++) {
        char c = normalizedString[i];

        // Combining marks (the "accents") are dropped
        if(char.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark) continue;

        if(char.IsLetterOrDigit(c)) {
            stringBuilder.Append(c);
            lastWasSpace = false;
        }
        else if(!isSingleWord && !lastWasSpace) {
            stringBuilder.Append(' ');
            lastWasSpace = true;
        }
    }

    // Only letters, digits and single spaces can be in the buffer at this point
    return stringBuilder.ToString().ToLowerInvariant().Trim(' ');
}
/// <summary>
/// Determines whether a char is a split char, i.e. a char that separates words.
/// </summary>
/// <param name="current">The current char.</param>
/// <returns><c>true</c> if the char is a split char, <c>false</c> otherwise.</returns>
public static bool IsSplitChar(char current) {
    // http://msdn.microsoft.com/en-us/library/system.globalization.unicodecategory.aspx
    switch(char.GetUnicodeCategory(current)) {
        // Letters, marks, numbers and currency symbols are part of a word
        case UnicodeCategory.UppercaseLetter:
        case UnicodeCategory.LowercaseLetter:
        case UnicodeCategory.TitlecaseLetter:
        case UnicodeCategory.ModifierLetter:
        case UnicodeCategory.OtherLetter:
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.DecimalDigitNumber:
        case UnicodeCategory.LetterNumber:
        case UnicodeCategory.OtherNumber:
        case UnicodeCategory.CurrencySymbol:
            return false;

        // Anything else splits words
        default:
            return true;
    }
}
/// <summary>
/// Computes the index of the first non-split char given a start index.
/// </summary>
/// <param name="startIndex">The start index.</param>
/// <param name="content">The content.</param>
/// <returns>The index of the first non-split char at or after <paramref name="startIndex"/>.
/// If no such char exists, the length of <paramref name="content"/>, or <paramref name="startIndex"/>
/// itself when it is already past the end. The result never exceeds <see cref="ushort.MaxValue"/>.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="content"/> is <c>null</c>.</exception>
public static ushort SkipSplitChars(ushort startIndex, string content) {
    if(content == null) throw new ArgumentNullException("content");

    // startIndex is unsigned, so it needs no lower-bound check

    int currentIndex = startIndex;

    // Stop at ushort.MaxValue so that the cast below can never wrap around
    // when the content is longer than 65535 chars
    while(currentIndex < content.Length &&
        currentIndex < ushort.MaxValue &&
        IsSplitChar(content[currentIndex])) {
        currentIndex++;
    }

    return (ushort)currentIndex;
}
/// <summary>
/// Tokenizes a string.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <param name="location">The location of the words that are extracted.</param>
/// <returns>The tokens. Words starting at char index 65500 or beyond are ignored.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="text"/> is <c>null</c>.</exception>
public static WordInfo[] Tokenize(string text, WordLocation location) {
    if(text == null) throw new ArgumentNullException("text");

    // Word start positions are stored as ushort: words starting at or beyond this index are not extracted
    const int MaxWordStart = 65500;

    List<WordInfo> words = new List<WordInfo>(text.Length / 5); // Average 5 chars/word

    // Indices are tracked as int: with ushort arithmetic they silently wrap around
    // on texts reaching 65535 chars, restarting the scan or producing invalid substrings
    int currentIndex = 0;

    // Skip all leading split chars
    while(currentIndex < text.Length && IsSplitChar(text[currentIndex])) currentIndex++;

    while(currentIndex < text.Length && currentIndex < MaxWordStart) {
        int currentWordStart = currentIndex;

        // Advance to the end of the current word
        while(currentIndex < text.Length && !IsSplitChar(text[currentIndex])) currentIndex++;

        string w = text.Substring(currentWordStart, currentIndex - currentWordStart);
        w = RemoveDiacriticsAndPunctuation(w, true);
        if(!string.IsNullOrEmpty(w)) {
            // Both casts are safe: currentWordStart < MaxWordStart, and the number
            // of words cannot exceed the number of chars scanned so far
            words.Add(new WordInfo(w, (ushort)currentWordStart, (ushort)words.Count, location));
        }

        // Move to the start of the next word (or to the end of the text)
        while(currentIndex < text.Length && IsSplitChar(text[currentIndex])) currentIndex++;
    }

    return words.ToArray();
}
/// <summary>
/// Tokenizes a string, marking every extracted word as located in the content.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <returns>The tokens.</returns>
public static WordInfo[] Tokenize(string text) {
    // Delegate to the full overload, using the content location
    WordInfo[] tokens = Tokenize(text, WordLocation.Content);
    return tokens;
}
/// <summary>
/// Removes stop words from a set of words (case insensitive).
/// </summary>
/// <param name="words">The input words.</param>
/// <param name="stopWords">The array of stop words.</param>
/// <returns>The input words without the stop words, in their original order.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="words"/> or <paramref name="stopWords"/> are <c>null</c>.</exception>
public static WordInfo[] RemoveStopWords(WordInfo[] words, string[] stopWords) {
    if(words == null) throw new ArgumentNullException("words");
    if(stopWords == null) throw new ArgumentNullException("stopWords");

    List<WordInfo> result = new List<WordInfo>(words.Length);

    foreach(WordInfo current in words) {
        bool isStopWord = false;

        foreach(string sw in stopWords) {
            // Case-insensitive, culture-invariant comparison
            if(string.Compare(current.Text, sw, true, CultureInfo.InvariantCulture) == 0) {
                isStopWord = true;
                break;
            }
        }

        if(!isStopWord) result.Add(current);
    }

    return result.ToArray();
}
}
/// <summary>
/// Defines the interface for a component that fetches words.
/// </summary>
public interface IWordFetcher : IDisposable {
/// <summary>
/// Tries to get a word.
/// </summary>
/// <param name="text">The text of the word.</param>
/// <param name="word">The found word, if any, <c>null</c> otherwise.</param>
/// <returns><c>true</c> if the word is found, <c>false</c> otherwise.</returns>
bool TryGetWord(string text, out Word word);
}
}