using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using ScrewTurn.Wiki.SearchEngine;

namespace ScrewTurn.Wiki {

    /// <summary>
    /// Stores index data to disk, using three binary files (documents, words, word mappings).
    /// </summary>
    /// <remarks>Documented by the original author as thread-safe. In this class only
    /// <see cref="Size" /> takes a lock (on <c>this</c>), so the guarantee presumably relies on
    /// <see cref="IndexStorerBase" /> serializing calls to the overridden members - confirm
    /// against the base class before relying on it.</remarks>
    public class IndexStorer : IndexStorerBase {

        // Fixed 8-byte header written at the start of every file and validated on every read.
        private static readonly byte[] ReservedBytes = new byte[] { 2, 0, 0, 0, 0, 0, 0, 0 };
        // Initial entry count (written as a 4-byte int) for a freshly created file.
        private static readonly int Zero = 0;

        private readonly string documentsFile, wordsFile, mappingsFile;

        // Next IDs to assign; recomputed from the stored data by LoadIndexInternal.
        private uint firstFreeDocumentId = 1;
        private uint firstFreeWordId = 1;

        // File layouts, as produced/consumed by the Read*/Write* helpers below
        // (strings use the BinaryReader/BinaryWriter length-prefixed UTF-8 format).
        //
        // Documents file binary format
        // Reserved(8bytes) Count(int) Entries...
        // ID(uint) Name(string) Title(string) TypeTag(string) DateTime(long)
        //
        // Words file binary format
        // Reserved(8bytes) Count(int) Entries...
        // ID(uint) Text(string)
        //
        // Mappings file binary format
        // Reserved(8bytes) Count(int) Entries...
        // WordID(uint) DocumentID(uint) FirstCharIndex(ushort) WordIndex(ushort) Location(byte)

        /// <summary>
        /// Initializes a new instance of the <see cref="IndexStorer" /> class and creates
        /// any of the three data files that does not exist yet.
        /// </summary>
        /// <param name="documentsFile">The file that contains the documents list.</param>
        /// <param name="wordsFile">The file that contains the words list.</param>
        /// <param name="mappingsFile">The file that contains the index mappings data.</param>
        /// <param name="index">The index to manage.</param>
        /// <exception cref="ArgumentNullException">If any file name is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">If any file name is empty.</exception>
        public IndexStorer(string documentsFile, string wordsFile, string mappingsFile, IInMemoryIndex index)
            : base(index) {

            if(documentsFile == null) throw new ArgumentNullException("documentsFile");
            if(wordsFile == null) throw new ArgumentNullException("wordsFile");
            if(mappingsFile == null) throw new ArgumentNullException("mappingsFile");

            if(documentsFile.Length == 0) throw new ArgumentException("Documents File cannot be empty", "documentsFile");
            if(wordsFile.Length == 0) throw new ArgumentException("Words File cannot be emtpy", "wordsFile");
            if(mappingsFile.Length == 0) throw new ArgumentException("Mappings File cannot be empty", "mappingsFile");

            this.documentsFile = documentsFile;
            this.wordsFile = wordsFile;
            this.mappingsFile = mappingsFile;

            InitFiles();
        }

        /// <summary>
        /// Gets the approximate size, in bytes, of the search engine index
        /// (the sum of the lengths of the three data files).
        /// </summary>
        public override long Size {
            get {
                // NOTE(review): locks on 'this'; presumably the base class synchronizes on the
                // same instance, so the lock target is intentionally left unchanged - confirm.
                lock(this) {
                    long size = 0;
                    size += new FileInfo(documentsFile).Length;
                    size += new FileInfo(wordsFile).Length;
                    size += new FileInfo(mappingsFile).Length;
                    return size;
                }
            }
        }

        /// <summary>
        /// Loads the index from the data store the first time.
        /// </summary>
        /// <param name="documents">The dumped documents.</param>
        /// <param name="words">The dumped words.</param>
        /// <param name="mappings">The dumped word mappings.</param>
        /// <remarks>Also resets the next free document/word IDs to one past the highest stored ID.</remarks>
        protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings) {
            uint maxDocumentId = 0;
            uint maxWordId = 0;

            // Readers are deliberately not disposed: disposing a BinaryReader closes the
            // underlying stream, whose lifetime is owned by the enclosing using statement.

            // 1. Load Documents
            using(FileStream fs = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int count = ReadCount(fs);
                BinaryReader reader = new BinaryReader(fs, Encoding.UTF8);
                documents = new DumpedDocument[count];
                for(int i = 0; i < count; i++) {
                    documents[i] = ReadDumpedDocument(reader);
                    if(documents[i].ID > maxDocumentId) maxDocumentId = documents[i].ID;
                }
                firstFreeDocumentId = maxDocumentId + 1;
            }

            // 2. Load Words
            using(FileStream fs = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int count = ReadCount(fs);
                BinaryReader reader = new BinaryReader(fs, Encoding.UTF8);
                words = new DumpedWord[count];
                for(int i = 0; i < count; i++) {
                    words[i] = ReadDumpedWord(reader);
                    if(words[i].ID > maxWordId) maxWordId = words[i].ID;
                }
                firstFreeWordId = maxWordId + 1;
            }

            // 3. Load Mappings
            using(FileStream fs = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int count = ReadCount(fs);
                BinaryReader reader = new BinaryReader(fs, Encoding.UTF8);
                mappings = new DumpedWordMapping[count];
                for(int i = 0; i < count; i++) {
                    mappings[i] = ReadDumpedWordMapping(reader);
                }
            }
        }

        /// <summary>
        /// Reads the reserved bytes.
        /// </summary>
        /// <param name="reader">The <see cref="BinaryReader" /> to read from.</param>
        /// <returns><c>true</c> if read bytes are equal to expected bytes, <c>false</c> otherwise.</returns>
        private static bool ReadReserved(BinaryReader reader) {
            bool allEqual = true;
            // Always consume the whole header, even after a mismatch, so the reader
            // ends up positioned right after the reserved bytes in every case.
            for(int i = 0; i < ReservedBytes.Length; i++) {
                int r = reader.ReadByte();
                if(r != ReservedBytes[i]) allEqual = false;
            }
            return allEqual;
        }

        /// <summary>
        /// Creates (or truncates) a data file containing only the header and a zero entry count.
        /// </summary>
        /// <param name="file">The path of the file to create.</param>
        private static void CreateEmptyFile(string file) {
            using(FileStream fs = new FileStream(file, FileMode.Create, FileAccess.Write, FileShare.None)) {
                BinaryWriter writer = new BinaryWriter(fs, Encoding.UTF8);
                WriteHeader(writer);
            }
        }

        /// <summary>
        /// Initializes the data files, if needed (existing files are left untouched).
        /// </summary>
        private void InitFiles() {
            if(!File.Exists(documentsFile)) CreateEmptyFile(documentsFile);
            if(!File.Exists(wordsFile)) CreateEmptyFile(wordsFile);
            if(!File.Exists(mappingsFile)) CreateEmptyFile(mappingsFile);
        }

        /// <summary>
        /// Initializes the data storage, unconditionally resetting all three files to empty.
        /// </summary>
        /// <param name="state">A state object passed from the index (not used here).</param>
        protected override void InitDataStore(object state) {
            CreateEmptyFile(documentsFile);
            CreateEmptyFile(wordsFile);
            CreateEmptyFile(mappingsFile);
        }

        /// <summary>
        /// Writes the binary file header: the reserved bytes followed by a zero entry count.
        /// </summary>
        /// <param name="writer">The <see cref="BinaryWriter" /> to write into.</param>
        private static void WriteHeader(BinaryWriter writer) {
            writer.Write(ReservedBytes);
            writer.Write(Zero);
        }

        /// <summary>
        /// Reads a <see cref="DumpedDocument" /> from a <see cref="BinaryReader" />.
        /// </summary>
        /// <param name="reader">The <see cref="BinaryReader" />.</param>
        /// <returns>The <see cref="DumpedDocument" />.</returns>
        private static DumpedDocument ReadDumpedDocument(BinaryReader reader) {
            uint id = reader.ReadUInt32();
            string name = reader.ReadString();
            string title = reader.ReadString();
            string typeTag = reader.ReadString();
            DateTime dateTime = DateTime.FromBinary(reader.ReadInt64());
            return new DumpedDocument(id, name, title, typeTag, dateTime);
        }

        /// <summary>
        /// Reads a <see cref="DumpedWord" /> from a <see cref="BinaryReader" />.
        /// </summary>
        /// <param name="reader">The <see cref="BinaryReader" />.</param>
        /// <returns>The <see cref="DumpedWord" />.</returns>
        private static DumpedWord ReadDumpedWord(BinaryReader reader) {
            uint id = reader.ReadUInt32();
            string text = reader.ReadString();
            return new DumpedWord(id, text);
        }

        /// <summary>
        /// Reads a <see cref="DumpedWordMapping" /> from a <see cref="BinaryReader" />.
        /// </summary>
        /// <param name="reader">The <see cref="BinaryReader" />.</param>
        /// <returns>The <see cref="DumpedWordMapping" />.</returns>
        private static DumpedWordMapping ReadDumpedWordMapping(BinaryReader reader) {
            uint wordId = reader.ReadUInt32();
            uint documentId = reader.ReadUInt32();
            ushort firstCharIndex = reader.ReadUInt16();
            ushort wordIndex = reader.ReadUInt16();
            byte location = reader.ReadByte();
            return new DumpedWordMapping(wordId, documentId, firstCharIndex, wordIndex, location);
        }

        /// <summary>
        /// Validates the header and reads the entry count in a <see cref="FileStream" />.
        /// </summary>
        /// <param name="fs">The <see cref="FileStream" />, at position zero.</param>
        /// <returns>The count.</returns>
        /// <remarks>On return the stream is positioned right after the count; the caller must
        /// properly seek the stream after calling the method if it needs another position.</remarks>
        /// <exception cref="InvalidOperationException">If the reserved bytes do not match.</exception>
        private static int ReadCount(FileStream fs) {
            // Not disposed on purpose: disposing the reader would close the caller's stream.
            BinaryReader reader = new BinaryReader(fs, Encoding.UTF8);
            if(!ReadReserved(reader)) {
                throw new InvalidOperationException("Invalid index file header");
            }
            return reader.ReadInt32();
        }

        /// <summary>
        /// Stores new data into the data storage.
        /// </summary>
        /// <param name="data">The data to store.</param>
        /// <param name="state">A state object passed from the index (not used here).</param>
        /// <returns>The storer result, carrying the assigned document ID and word IDs.</returns>
        /// <remarks>When saving a new document, the document ID in data.Mappings must be
        /// replaced with the correct document ID, generated by the concrete implementation of
        /// this method. data.Words should have IDs numbered from uint.MaxValue downwards.
        /// The method re-numbers the words appropriately.</remarks>
        protected override IndexStorerResult SaveData(DumpedChange data, object state) {
            IndexStorerResult result = new IndexStorerResult(null, null);

            // 1. Save Document
            using(FileStream fs = new FileStream(documentsFile, FileMode.Open, FileAccess.ReadWrite, FileShare.None)) {
                int count = ReadCount(fs);

                // Update count (step back over the 4 bytes just read) and append document
                BinaryWriter writer = new BinaryWriter(fs, Encoding.UTF8);
                fs.Seek(-4, SeekOrigin.Current);
                writer.Write(count + 1);
                writer.Seek(0, SeekOrigin.End);

                data.Document.ID = firstFreeDocumentId;
                WriteDumpedDocument(writer, data.Document);
                result.DocumentID = firstFreeDocumentId;
                firstFreeDocumentId++;
            }

            // 2. Save Words
            // Maps the temporary word ID received in data.Words to the ID actually stored.
            Dictionary<uint, WordId> wordIds = null;
            using(FileStream fs = new FileStream(wordsFile, FileMode.Open, FileAccess.ReadWrite, FileShare.None)) {
                int count = ReadCount(fs);

                // Update count and append words
                BinaryWriter writer = new BinaryWriter(fs, Encoding.UTF8);
                fs.Seek(-4, SeekOrigin.Current);
                writer.Write(count + data.Words.Count);
                fs.Seek(0, SeekOrigin.End);

                wordIds = new Dictionary<uint, WordId>(data.Words.Count);
                foreach(DumpedWord dw in data.Words) {
                    wordIds.Add(dw.ID, new WordId(dw.Text, firstFreeWordId));
                    dw.ID = firstFreeWordId;
                    WriteDumpedWord(writer, dw);
                    firstFreeWordId++;
                }
                result.WordIDs = new List<WordId>(wordIds.Values);
            }

            // 3. Save Mappings
            using(FileStream fs = new FileStream(mappingsFile, FileMode.Open, FileAccess.ReadWrite, FileShare.None)) {
                int count = ReadCount(fs);

                // Update count and append mappings
                BinaryWriter writer = new BinaryWriter(fs, Encoding.UTF8);
                fs.Seek(-4, SeekOrigin.Current);
                writer.Write(count + data.Mappings.Count);
                fs.Seek(0, SeekOrigin.End);

                foreach(DumpedWordMapping map in data.Mappings) {
                    // Words are autonumbered from uint.MaxValue downwards by IndexBase so that
                    // IndexStorer can identify the DumpedWordMappings easily and
                    // fix the IDs with the ones actually stored
                    WordId newMappingWordId;
                    if(wordIds != null && wordIds.TryGetValue(map.WordID, out newMappingWordId)) {
                        map.WordID = newMappingWordId.ID;
                    }

                    WriteDumpedWordMapping(writer,
                        new DumpedWordMapping(map.WordID, result.DocumentID.Value,
                            map.FirstCharIndex, map.WordIndex, map.Location));
                }
            }

            return result;
        }

        /// <summary>
        /// Gets a temporary file name given an original name
        /// (same folder and extension, with "_Temp" appended to the base name).
        /// </summary>
        /// <param name="file">The original name.</param>
        /// <returns>The temporary file name.</returns>
        private static string GetTempFile(string file) {
            string folder = Path.GetDirectoryName(file);
            string name = Path.GetFileNameWithoutExtension(file) + "_Temp" + Path.GetExtension(file);
            return Path.Combine(folder, name);
        }

        /// <summary>
        /// Overwrites a data file with its regenerated temporary copy, then removes the copy.
        /// </summary>
        /// <param name="tempFile">The regenerated temporary file.</param>
        /// <param name="file">The data file to overwrite.</param>
        private static void ReplaceFile(string tempFile, string file) {
            File.Copy(tempFile, file, true);
            File.Delete(tempFile);
        }

        /// <summary>
        /// Deletes data from the data storage.
        /// </summary>
        /// <param name="data">The data to delete.</param>
        /// <param name="state">A state object passed from the index (not used here).</param>
        /// <remarks>Each file is regenerated in a temporary location, skipping the entries
        /// to delete, and then copied back over the original.</remarks>
        protected override void DeleteData(DumpedChange data, object state) {
            string tempDocumentsFile = GetTempFile(documentsFile);
            string tempWordsFile = GetTempFile(wordsFile);
            string tempMappingsFile = GetTempFile(mappingsFile);

            // 1. Remove Mappings
            using(FileStream fsi = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int count = ReadCount(fsi);
                // The count field sits in the 4 bytes just read; same offset in the output file.
                int countLocation = (int)fsi.Position - 4;
                int writeCount = 0;
                BinaryReader reader = new BinaryReader(fsi, Encoding.UTF8);

                using(FileStream fso = new FileStream(tempMappingsFile, FileMode.Create, FileAccess.Write, FileShare.None)) {
                    BinaryWriter writer = new BinaryWriter(fso, Encoding.UTF8);
                    WriteHeader(writer);

                    for(int i = 0; i < count; i++) {
                        DumpedWordMapping m = ReadDumpedWordMapping(reader);
                        // If m is not contained in data.Mappings, store it in the temporary file
                        if(!Find(m, data.Mappings)) {
                            WriteDumpedWordMapping(writer, m);
                            writeCount++;
                        }
                    }

                    // Patch the placeholder count written by WriteHeader
                    writer.Seek(countLocation, SeekOrigin.Begin);
                    writer.Write(writeCount);
                }
            }
            ReplaceFile(tempMappingsFile, mappingsFile);

            // 2. Remove Words
            using(FileStream fsi = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int count = ReadCount(fsi);
                int countLocation = (int)fsi.Position - 4;
                int writeCount = 0;
                BinaryReader reader = new BinaryReader(fsi, Encoding.UTF8);

                using(FileStream fso = new FileStream(tempWordsFile, FileMode.Create, FileAccess.Write, FileShare.None)) {
                    BinaryWriter writer = new BinaryWriter(fso, Encoding.UTF8);
                    WriteHeader(writer);

                    for(int i = 0; i < count; i++) {
                        DumpedWord w = ReadDumpedWord(reader);
                        // If w is not contained in data.Words, store it in the temporary file
                        if(!Find(w, data.Words)) {
                            WriteDumpedWord(writer, w);
                            writeCount++;
                        }
                    }

                    writer.Seek(countLocation, SeekOrigin.Begin);
                    writer.Write(writeCount);
                }
            }
            ReplaceFile(tempWordsFile, wordsFile);

            // 3. Remove Document
            using(FileStream fsi = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
                int count = ReadCount(fsi);
                int countLocation = (int)fsi.Position - 4;
                int writeCount = 0;
                BinaryReader reader = new BinaryReader(fsi, Encoding.UTF8);

                using(FileStream fso = new FileStream(tempDocumentsFile, FileMode.Create, FileAccess.Write, FileShare.None)) {
                    BinaryWriter writer = new BinaryWriter(fso, Encoding.UTF8);
                    WriteHeader(writer);

                    for(int i = 0; i < count; i++) {
                        DumpedDocument d = ReadDumpedDocument(reader);
                        // If d is not equal to data.Document (to be deleted), then copy it to the result file
                        if(!EqualDumpedDocument(d, data.Document)) {
                            WriteDumpedDocument(writer, d);
                            writeCount++;
                        }
                    }

                    // Store the number of entries actually written rather than assuming
                    // exactly one document was removed: if the document is missing (or matches
                    // more than once) a blind "count - 1" would leave a header that disagrees
                    // with the file content and break the next load.
                    writer.Seek(countLocation, SeekOrigin.Begin);
                    writer.Write(writeCount);
                }
            }
            ReplaceFile(tempDocumentsFile, documentsFile);
        }

        /// <summary>
        /// Writes a <see cref="DumpedDocument" /> to a <see cref="BinaryWriter" />.
        /// </summary>
        /// <param name="writer">The <see cref="BinaryWriter" />.</param>
        /// <param name="document">The <see cref="DumpedDocument" />.</param>
        private static void WriteDumpedDocument(BinaryWriter writer, DumpedDocument document) {
            writer.Write(document.ID);
            writer.Write(document.Name);
            writer.Write(document.Title);
            writer.Write(document.TypeTag);
            writer.Write(document.DateTime.ToBinary());
        }

        /// <summary>
        /// Writes a <see cref="DumpedWord" /> to a <see cref="BinaryWriter" />.
        /// </summary>
        /// <param name="writer">The <see cref="BinaryWriter" />.</param>
        /// <param name="word">The <see cref="DumpedWord" />.</param>
        /// <remarks>Words with empty text are written as-is (no validation is performed).</remarks>
        private static void WriteDumpedWord(BinaryWriter writer, DumpedWord word) {
            writer.Write(word.ID);
            writer.Write(word.Text);
        }

        /// <summary>
        /// Writes a <see cref="DumpedWordMapping" /> to a <see cref="BinaryWriter" />.
        /// </summary>
        /// <param name="writer">The <see cref="BinaryWriter" />.</param>
        /// <param name="mapping">The <see cref="DumpedWordMapping" />.</param>
        private static void WriteDumpedWordMapping(BinaryWriter writer, DumpedWordMapping mapping) {
            writer.Write(mapping.WordID);
            writer.Write(mapping.DocumentID);
            writer.Write(mapping.FirstCharIndex);
            writer.Write(mapping.WordIndex);
            writer.Write(mapping.Location);
        }

        /// <summary>
        /// Determines whether two <see cref="DumpedDocument" />s are equal.
        /// </summary>
        /// <param name="d1">The first document.</param>
        /// <param name="d2">The second document.</param>
        /// <returns><c>true</c> if the documents are equal, <c>false</c> otherwise.</returns>
        private static bool EqualDumpedDocument(DumpedDocument d1, DumpedDocument d2) {
            // Only ID, Name and TypeTag are considered; Title and DateTime are
            // deliberately left out of the comparison.
            return d1.ID == d2.ID && d1.Name == d2.Name && d1.TypeTag == d2.TypeTag;
        }
    }

}