screwturn-4/Core/IndexStorer.cs
2009-09-30 13:47:13 +00:00

491 lines
18 KiB
C#

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using ScrewTurn.Wiki.SearchEngine;
namespace ScrewTurn.Wiki {
/// <summary>
/// Stores index data to disk.
/// </summary>
/// <remarks>Instance and static members are <b>thread-safe</b>.</remarks>
public class IndexStorer : IndexStorerBase {
// Eight bytes written at the start of every data file and verified when a file is read back
private static readonly byte[] ReservedBytes = { 2, 0, 0, 0, 0, 0, 0, 0 };
// Entry count written into the header of a newly created (empty) data file
private static readonly int Zero = 0;
// Paths of the three data files managed by this instance
private string documentsFile;
private string wordsFile;
private string mappingsFile;
// Next IDs to assign when storing new documents and words
private uint firstFreeDocumentId = 1;
private uint firstFreeWordId = 1;
// Documents file binary format
// Reserved(8bytes) Count(int) Entries...
// ID(uint) Name(string) Title(string) TypeTag(string) DateTime(long)
// Words file binary format
// Reserved(8bytes) Count(int) Entries...
// ID(uint) Text(string)
// Mappings file binary format
// Reserved(8bytes) Count(int) Entries...
// WordID(uint) DocumentID(uint) FirstCharIndex(ushort) WordIndex(ushort) Location(byte)
/// <summary>
/// Initializes a new instance of the <see cref="IndexStorer" /> class.
/// </summary>
/// <param name="documentsFile">The file that contains the documents list.</param>
/// <param name="wordsFile">The file that contains the words list.</param>
/// <param name="mappingsFile">The file that contains the index mappings data.</param>
/// <param name="index">The index to manage.</param>
/// <exception cref="ArgumentNullException">If <paramref name="documentsFile" />, <paramref name="wordsFile" />
/// or <paramref name="mappingsFile" /> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">If <paramref name="documentsFile" />, <paramref name="wordsFile" />
/// or <paramref name="mappingsFile" /> is empty.</exception>
/// <remarks>Data files that do not exist yet are created, containing only the file header.</remarks>
public IndexStorer(string documentsFile, string wordsFile, string mappingsFile, IInMemoryIndex index)
    : base(index) {
    // All null checks come first, so a null argument is always reported as such
    if(documentsFile == null) throw new ArgumentNullException("documentsFile");
    if(wordsFile == null) throw new ArgumentNullException("wordsFile");
    if(mappingsFile == null) throw new ArgumentNullException("mappingsFile");
    if(documentsFile.Length == 0) throw new ArgumentException("Documents File cannot be empty", "documentsFile");
    if(wordsFile.Length == 0) throw new ArgumentException("Words File cannot be empty", "wordsFile");
    if(mappingsFile.Length == 0) throw new ArgumentException("Mappings File cannot be empty", "mappingsFile");
    this.documentsFile = documentsFile;
    this.wordsFile = wordsFile;
    this.mappingsFile = mappingsFile;
    // Create any missing data file with an empty header
    InitFiles();
}
/// <summary>
/// Gets the approximate size, in bytes, of the search engine index,
/// computed as the sum of the lengths of the three data files.
/// </summary>
public override long Size {
    get {
        lock(this) {
            long total = 0;
            // Same order as the original fields: documents, words, mappings
            foreach(string path in new string[] { documentsFile, wordsFile, mappingsFile }) {
                total += new FileInfo(path).Length;
            }
            return total;
        }
    }
}
/// <summary>
/// Reads the whole index (documents, words and word mappings) from the three data files.
/// </summary>
/// <param name="documents">Receives the documents read from the documents file.</param>
/// <param name="words">Receives the words read from the words file.</param>
/// <param name="mappings">Receives the word mappings read from the mappings file.</param>
/// <remarks>As a side effect, the next free document ID and the next free word ID are set
/// to one more than the highest ID found in the respective file.</remarks>
protected override void LoadIndexInternal(out DumpedDocument[] documents, out DumpedWord[] words, out DumpedWordMapping[] mappings) {
    uint highestDocumentId = 0;
    uint highestWordId = 0;

    // Documents
    using(FileStream docStream = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int total = ReadCount(docStream);
        BinaryReader docReader = new BinaryReader(docStream, Encoding.UTF8);
        documents = new DumpedDocument[total];
        for(int index = 0; index < total; index++) {
            DumpedDocument current = ReadDumpedDocument(docReader);
            documents[index] = current;
            if(current.ID > highestDocumentId) highestDocumentId = current.ID;
        }
        firstFreeDocumentId = highestDocumentId + 1;
    }

    // Words
    using(FileStream wordStream = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int total = ReadCount(wordStream);
        BinaryReader wordReader = new BinaryReader(wordStream, Encoding.UTF8);
        words = new DumpedWord[total];
        for(int index = 0; index < total; index++) {
            DumpedWord current = ReadDumpedWord(wordReader);
            words[index] = current;
            if(current.ID > highestWordId) highestWordId = current.ID;
        }
        firstFreeWordId = highestWordId + 1;
    }

    // Mappings (no ID tracking needed)
    using(FileStream mapStream = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int total = ReadCount(mapStream);
        BinaryReader mapReader = new BinaryReader(mapStream, Encoding.UTF8);
        mappings = new DumpedWordMapping[total];
        for(int index = 0; index < total; index++) {
            mappings[index] = ReadDumpedWordMapping(mapReader);
        }
    }
}
/// <summary>
/// Consumes the reserved header bytes from a reader and compares them with the expected ones.
/// </summary>
/// <param name="reader">The <see cref="BinaryReader" /> to read from.</param>
/// <returns><c>true</c> if every byte read equals the corresponding expected byte, <c>false</c> otherwise.</returns>
/// <remarks>All the reserved bytes are always read, even after a mismatch has been found.</remarks>
private static bool ReadReserved(BinaryReader reader) {
    bool matches = true;
    foreach(byte expected in ReservedBytes) {
        byte actual = reader.ReadByte();
        matches &= (actual == expected);
    }
    return matches;
}
/// <summary>
/// Creates each data file that does not exist yet, writing only the file header into it.
/// </summary>
/// <remarks>Existing files are left untouched.</remarks>
private void InitFiles() {
    string[] dataFiles = new string[] { documentsFile, wordsFile, mappingsFile };
    foreach(string path in dataFiles) {
        if(File.Exists(path)) continue;
        using(FileStream stream = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.None)) {
            WriteHeader(new BinaryWriter(stream, Encoding.UTF8));
        }
    }
}
/// <summary>
/// Resets the data storage: the three data files are re-created (overwriting any
/// existing content) and contain only the file header.
/// </summary>
/// <param name="state">A state object passed from the index (not used here).</param>
protected override void InitDataStore(object state) {
    string[] dataFiles = new string[] { documentsFile, wordsFile, mappingsFile };
    foreach(string path in dataFiles) {
        using(FileStream stream = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.None)) {
            WriteHeader(new BinaryWriter(stream, Encoding.UTF8));
        }
    }
}
/// <summary>
/// Writes the header of an empty data file: the reserved bytes followed by an entry count of zero.
/// </summary>
/// <param name="writer">The <see cref="BinaryWriter" /> to write into.</param>
private static void WriteHeader(BinaryWriter writer) {
    // Reserved bytes first
    writer.Write(ReservedBytes, 0, ReservedBytes.Length);
    // Then the 32-bit entry count, initially zero
    writer.Write(Zero);
}
/// <summary>
/// Reads one <see cref="DumpedDocument" /> entry from a <see cref="BinaryReader" />.
/// </summary>
/// <param name="reader">The <see cref="BinaryReader" /> to read from.</param>
/// <returns>The <see cref="DumpedDocument" /> built from the values read.</returns>
/// <remarks>Values are read in this order: ID, name, title, type tag, date/time (as a 64-bit binary value).</remarks>
private static DumpedDocument ReadDumpedDocument(BinaryReader reader) {
    uint documentId = reader.ReadUInt32();
    string documentName = reader.ReadString();
    string documentTitle = reader.ReadString();
    string documentTypeTag = reader.ReadString();
    long rawDateTime = reader.ReadInt64();
    DateTime documentDateTime = DateTime.FromBinary(rawDateTime);
    return new DumpedDocument(documentId, documentName, documentTitle, documentTypeTag, documentDateTime);
}
/// <summary>
/// Reads one <see cref="DumpedWord" /> entry from a <see cref="BinaryReader" />.
/// </summary>
/// <param name="reader">The <see cref="BinaryReader" /> to read from.</param>
/// <returns>The <see cref="DumpedWord" /> built from the values read.</returns>
/// <remarks>The ID is read first, then the text.</remarks>
private static DumpedWord ReadDumpedWord(BinaryReader reader) {
    uint wordId = reader.ReadUInt32();
    string wordText = reader.ReadString();
    return new DumpedWord(wordId, wordText);
}
/// <summary>
/// Reads one <see cref="DumpedWordMapping" /> entry from a <see cref="BinaryReader" />.
/// </summary>
/// <param name="reader">The <see cref="BinaryReader" /> to read from.</param>
/// <returns>The <see cref="DumpedWordMapping" /> built from the values read.</returns>
/// <remarks>Values are read in this order: word ID (32 bits), document ID (32 bits),
/// first char index (16 bits), word index (16 bits), location (8 bits).</remarks>
private static DumpedWordMapping ReadDumpedWordMapping(BinaryReader reader) {
    uint mappedWordId = reader.ReadUInt32();
    uint mappedDocumentId = reader.ReadUInt32();
    ushort charIndex = reader.ReadUInt16();
    ushort positionInDocument = reader.ReadUInt16();
    byte wordLocation = reader.ReadByte();
    return new DumpedWordMapping(mappedWordId, mappedDocumentId, charIndex, positionInDocument, wordLocation);
}
/// <summary>
/// Validates the header of a data file and reads the entry count that follows it.
/// </summary>
/// <param name="fs">The <see cref="FileStream" />, at position <b>zero</b>.</param>
/// <returns>The entry count stored in the file header.</returns>
/// <exception cref="InvalidOperationException">If the reserved header bytes are not the expected ones.</exception>
/// <remarks>The caller must properly seek the stream after calling the method.</remarks>
private static int ReadCount(FileStream fs) {
    BinaryReader headerReader = new BinaryReader(fs, Encoding.UTF8);
    bool headerIsValid = ReadReserved(headerReader);
    if(headerIsValid) return headerReader.ReadInt32();
    throw new InvalidOperationException("Invalid index file header");
}
/// <summary>
/// Appends a new document, its words and its word mappings to the three data files.
/// </summary>
/// <param name="data">The data to store.</param>
/// <param name="state">A state object passed from the index (not used here).</param>
/// <returns>The storer result, carrying the ID assigned to the document and the IDs assigned to the words.</returns>
/// <remarks>When saving a new document, the document ID in data.Mappings is replaced with
/// the correct document ID generated by this method. data.Words should have IDs numbered
/// from uint.MaxValue downwards; the method re-numbers the words, and the mappings that
/// reference them, with the IDs actually stored.</remarks>
protected override IndexStorerResult SaveData(DumpedChange data, object state) {
    IndexStorerResult outcome = new IndexStorerResult(null, null);

    // Step 1: documents file
    using(FileStream docStream = new FileStream(documentsFile, FileMode.Open, FileAccess.ReadWrite, FileShare.None)) {
        int storedDocuments = ReadCount(docStream);
        BinaryWriter docWriter = new BinaryWriter(docStream, Encoding.UTF8);
        // Step back over the 4-byte count just read and overwrite it
        docStream.Seek(-4, SeekOrigin.Current);
        docWriter.Write(storedDocuments + 1);
        // New entries are appended at the end of the file
        docStream.Seek(0, SeekOrigin.End);
        data.Document.ID = firstFreeDocumentId;
        WriteDumpedDocument(docWriter, data.Document);
        outcome.DocumentID = firstFreeDocumentId;
        firstFreeDocumentId++;
    }

    // Step 2: words file; remembers, for each temporary word ID, the ID actually stored
    Dictionary<uint, WordId> renumbered = null;
    using(FileStream wordStream = new FileStream(wordsFile, FileMode.Open, FileAccess.ReadWrite, FileShare.None)) {
        int storedWords = ReadCount(wordStream);
        BinaryWriter wordWriter = new BinaryWriter(wordStream, Encoding.UTF8);
        wordStream.Seek(-4, SeekOrigin.Current);
        wordWriter.Write(storedWords + data.Words.Count);
        wordStream.Seek(0, SeekOrigin.End);
        renumbered = new Dictionary<uint, WordId>(data.Words.Count);
        foreach(DumpedWord word in data.Words) {
            renumbered.Add(word.ID, new WordId(word.Text, firstFreeWordId));
            word.ID = firstFreeWordId;
            WriteDumpedWord(wordWriter, word);
            firstFreeWordId++;
        }
        outcome.WordIDs = new List<WordId>(renumbered.Values);
    }

    // Step 3: mappings file
    using(FileStream mapStream = new FileStream(mappingsFile, FileMode.Open, FileAccess.ReadWrite, FileShare.None)) {
        int storedMappings = ReadCount(mapStream);
        BinaryWriter mapWriter = new BinaryWriter(mapStream, Encoding.UTF8);
        mapStream.Seek(-4, SeekOrigin.Current);
        mapWriter.Write(storedMappings + data.Mappings.Count);
        mapStream.Seek(0, SeekOrigin.End);
        foreach(DumpedWordMapping mapping in data.Mappings) {
            // Mappings that point to a word stored in step 2 still carry its temporary ID:
            // swap it for the stored one. Other mappings keep their word ID.
            WordId stored;
            if(renumbered.TryGetValue(mapping.WordID, out stored)) {
                mapping.WordID = stored.ID;
            }
            DumpedWordMapping toWrite = new DumpedWordMapping(mapping.WordID, outcome.DocumentID.Value,
                mapping.FirstCharIndex, mapping.WordIndex, mapping.Location);
            WriteDumpedWordMapping(mapWriter, toWrite);
        }
    }

    return outcome;
}
/// <summary>
/// Builds the name of the temporary file associated with a data file.
/// </summary>
/// <param name="file">The original file name.</param>
/// <returns>A file name in the same directory as <paramref name="file" />, with <c>_Temp</c>
/// inserted between the file name and its extension.</returns>
private static string GetTempFile(string file) {
    string directory = Path.GetDirectoryName(file);
    string baseName = Path.GetFileNameWithoutExtension(file);
    string extension = Path.GetExtension(file);
    return Path.Combine(directory, baseName + "_Temp" + extension);
}
/// <summary>
/// Deletes data from the data storage.
/// </summary>
/// <param name="data">The data to delete.</param>
/// <param name="state">A state object passed from the index (not used here).</param>
/// <remarks>Each data file is regenerated into a temporary file in the same directory, skipping
/// the entries to delete; the temporary file is then copied over the original and removed.
/// Files are processed in this order: mappings, words, documents. The entry count stored in
/// each file header is the number of entries actually written.</remarks>
protected override void DeleteData(DumpedChange data, object state) {
    // Files are regenerated in a temporary location and copied back
    string tempDocumentsFile = GetTempFile(documentsFile);
    string tempWordsFile = GetTempFile(wordsFile);
    string tempMappingsFile = GetTempFile(mappingsFile);

    // 1. Remove Mappings
    using(FileStream fsi = new FileStream(mappingsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int count = ReadCount(fsi);
        // The count is the 4-byte value just read, so it starts 4 bytes before the current position
        int countLocation = (int)fsi.Position - 4;
        int writeCount = 0;
        BinaryReader reader = new BinaryReader(fsi, Encoding.UTF8);
        using(FileStream fso = new FileStream(tempMappingsFile, FileMode.Create, FileAccess.Write, FileShare.None)) {
            BinaryWriter writer = new BinaryWriter(fso, Encoding.UTF8);
            WriteHeader(writer);
            DumpedWordMapping m;
            for(int i = 0; i < count; i++) {
                m = ReadDumpedWordMapping(reader);
                // If m is not contained in data.Mappings, store it in the temporary file
                if(!Find(m, data.Mappings)) {
                    WriteDumpedWordMapping(writer, m);
                    writeCount++;
                }
            }
            // Patch the header with the number of entries kept
            writer.Seek(countLocation, SeekOrigin.Begin);
            writer.Write(writeCount);
        }
    }
    // Replace the file
    File.Copy(tempMappingsFile, mappingsFile, true);
    File.Delete(tempMappingsFile);

    // 2. Remove Words
    using(FileStream fsi = new FileStream(wordsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int count = ReadCount(fsi);
        int countLocation = (int)fsi.Position - 4;
        int writeCount = 0;
        BinaryReader reader = new BinaryReader(fsi, Encoding.UTF8);
        using(FileStream fso = new FileStream(tempWordsFile, FileMode.Create, FileAccess.Write, FileShare.None)) {
            BinaryWriter writer = new BinaryWriter(fso, Encoding.UTF8);
            WriteHeader(writer);
            DumpedWord w;
            for(int i = 0; i < count; i++) {
                w = ReadDumpedWord(reader);
                // If w is not contained in data.Words, store it in the temporary file
                if(!Find(w, data.Words)) {
                    WriteDumpedWord(writer, w);
                    writeCount++;
                }
            }
            writer.Seek(countLocation, SeekOrigin.Begin);
            writer.Write(writeCount);
        }
    }
    // Replace the file
    File.Copy(tempWordsFile, wordsFile, true);
    File.Delete(tempWordsFile);

    // 3. Remove Document
    using(FileStream fsi = new FileStream(documentsFile, FileMode.Open, FileAccess.Read, FileShare.None)) {
        int count = ReadCount(fsi);
        int countLocation = (int)fsi.Position - 4;
        int writeCount = 0;
        BinaryReader reader = new BinaryReader(fsi, Encoding.UTF8);
        using(FileStream fso = new FileStream(tempDocumentsFile, FileMode.Create, FileAccess.Write, FileShare.None)) {
            BinaryWriter writer = new BinaryWriter(fso, Encoding.UTF8);
            WriteHeader(writer);
            DumpedDocument d;
            for(int i = 0; i < count; i++) {
                d = ReadDumpedDocument(reader);
                // If d is not equal to data.Document (to be deleted), then copy it to the result file
                if(!EqualDumpedDocument(d, data.Document)) {
                    WriteDumpedDocument(writer, d);
                    writeCount++;
                }
            }
            // Store the number of documents actually kept. Assuming "count - 1" would leave
            // a wrong count in the header whenever no stored document matches data.Document
            // (including an empty file, where it would be -1).
            writer.Seek(countLocation, SeekOrigin.Begin);
            writer.Write(writeCount);
        }
    }
    // Replace the file
    File.Copy(tempDocumentsFile, documentsFile, true);
    File.Delete(tempDocumentsFile);
}
/// <summary>
/// Writes a <see cref="DumpedDocument" /> to a <see cref="BinaryWriter" />.
/// </summary>
/// <param name="writer">The <see cref="BinaryWriter" /> to write into.</param>
/// <param name="document">The <see cref="DumpedDocument" /> to write.</param>
/// <remarks>Fields are written in the order ID, Name, Title, TypeTag, DateTime, which is the
/// order in which <c>ReadDumpedDocument</c> reads them back.</remarks>
private static void WriteDumpedDocument(BinaryWriter writer, DumpedDocument document) {
writer.Write(document.ID);
writer.Write(document.Name);
writer.Write(document.Title);
writer.Write(document.TypeTag);
// The date/time is stored as the value returned by ToBinary(); the reader restores it with DateTime.FromBinary
writer.Write(document.DateTime.ToBinary());
}
/// <summary>
/// Writes a <see cref="DumpedWord" /> to a <see cref="BinaryWriter" />.
/// </summary>
/// <param name="writer">The <see cref="BinaryWriter" /> to write into.</param>
/// <param name="word">The <see cref="DumpedWord" /> to write.</param>
/// <remarks>The ID is written first, then the text, which is the order in which
/// <c>ReadDumpedWord</c> reads them back. Words with empty text are written as they are
/// (the check below is disabled).</remarks>
private static void WriteDumpedWord(BinaryWriter writer, DumpedWord word) {
//if(word.Text.Length == 0) throw new InvalidOperationException();
writer.Write(word.ID);
writer.Write(word.Text);
}
/// <summary>
/// Writes a <see cref="DumpedWordMapping" /> to a <see cref="BinaryWriter" />.
/// </summary>
/// <param name="writer">The <see cref="BinaryWriter" /> to write into.</param>
/// <param name="mapping">The <see cref="DumpedWordMapping" /> to write.</param>
/// <remarks>Fields are written in the order WordID, DocumentID, FirstCharIndex, WordIndex,
/// Location, which is the order in which <c>ReadDumpedWordMapping</c> reads them back.
/// NOTE(review): the number of bytes written per field depends on the property types declared
/// on DumpedWordMapping (not visible in this file); the reader expects 4, 4, 2, 2 and 1 bytes
/// respectively - confirm they agree.</remarks>
private static void WriteDumpedWordMapping(BinaryWriter writer, DumpedWordMapping mapping) {
writer.Write(mapping.WordID);
writer.Write(mapping.DocumentID);
writer.Write(mapping.FirstCharIndex);
writer.Write(mapping.WordIndex);
writer.Write(mapping.Location);
}
/// <summary>
/// Determines whether two <see cref="DumpedDocument" />s identify the same document.
/// </summary>
/// <param name="d1">The first document.</param>
/// <param name="d2">The second document.</param>
/// <returns><c>true</c> if the documents have the same ID, Name and TypeTag, <c>false</c> otherwise.</returns>
/// <remarks>Title and DateTime are deliberately left out of the comparison.</remarks>
private static bool EqualDumpedDocument(DumpedDocument d1, DumpedDocument d2) {
    if(d1.ID != d2.ID) return false;
    if(d1.Name != d2.Name) return false;
    return d1.TypeTag == d2.TypeTag;
}
}
}