experimental statistical language classification system

2025-08-03 00:02:00 +02:00 · 2015-12-28 14:32:43 -06:00 · 2015-12-28 14:32:43 -06:00 · 242cc6e92d
commit 242cc6e92d
parent 9cc85a48bb
15 changed files with 178 additions and 37 deletions
--- a/models/site.rb
+++ b/models/site.rb
@ -420,7 +420,7 @@ class Site < Sequel::Model
  end

  def get_file(path)
-    File.read files_path(path)
+    File.read current_files_path(path) # resolved under BANNED_SITES_ROOT when is_banned, else the regular site root
  end

  def before_destroy
@ -930,6 +930,12 @@ class Site < Sequel::Model
    File.join TEMPLATE_ROOT, name
  end

+  # Directory that currently holds the files for +name+ (defaults to this
+  # site's username): a path under BANNED_SITES_ROOT when is_banned is set,
+  # otherwise whatever base_files_path returns.
+  # Raises 'username missing' when +name+ is nil or empty.
+  def current_base_files_path(name=username)
+    raise 'username missing' if name.nil? || name.empty?
+    if is_banned
+      File.join(BANNED_SITES_ROOT, name)
+    else
+      base_files_path(name)
+    end
+  end
+
  def base_files_path(name=username)
    raise 'username missing' if name.nil? || name.empty?
    File.join SITE_FILES_ROOT, name
@ -950,6 +956,10 @@ class Site < Sequel::Model
    clean.join '/'
  end

+  # Joins the scrubbed form of +path+ onto current_base_files_path.
+  # With the default empty +path+ this points at the directory itself.
+  def current_files_path(path='')
+    root = current_base_files_path
+    File.join(root, scrubbed_path(path))
+  end
+
  # Joins the scrubbed form of +path+ onto base_files_path.
  # With the default empty +path+ this points at the directory itself.
  def files_path(path='')
    root = base_files_path
    File.join(root, scrubbed_path(path))
  end
@ -1235,6 +1245,45 @@ class Site < Sequel::Model
    !site_files_dataset.where(path: /^\/?index.html$/).where(sha1_hash: EMPTY_FILE_HASH).first.nil?
  end

+  # Runs the processed text of the file at +path+ through $classifier.classify
+  # and returns its result, or nil when classification_allowed? rejects +path+.
+  def classify(path)
+    if classification_allowed?(path)
+      $classifier.classify(process_for_classification(path))
+    end
+  end
+
+  # Returns $classifier.classification_scores for the processed text of the
+  # file at +path+, or nil when classification_allowed? rejects +path+.
+  def classification_scores(path)
+    return nil unless classification_allowed?(path)
+    text = process_for_classification(path)
+    $classifier.classification_scores(text)
+  end
+
+  # Feeds the processed text of the file at +path+ to $trainer under
+  # +category+, then records +category+ on the matching site_files row
+  # (saved with validation disabled).
+  # Returns nil when classification_allowed? rejects +path+, otherwise the
+  # result of save_changes.
+  def train(path, category='ham')
+    return nil unless classification_allowed?(path)
+    $trainer.train(category, process_for_classification(path))
+    record = site_files_dataset.where(path: path).first
+    record.classifier = category
+    record.save_changes(validate: false)
+  end
+
+  # Removes the processed text of the file at +path+ from $trainer under
+  # +category+, then writes +category+ to the matching site_files row
+  # (saved with validation disabled).
+  # Returns nil when classification_allowed? rejects +path+, otherwise the
+  # result of save_changes.
+  def untrain(path, category='ham')
+    return nil unless classification_allowed?(path)
+    $trainer.untrain(category, process_for_classification(path))
+    record = site_files_dataset.where(path: path).first
+    # NOTE(review): this stores +category+ exactly as train does; confirm
+    # whether an untrained file is meant to keep its classifier label.
+    record.classifier = category
+    record.save_changes(validate: false)
+  end
+
+  # Whether the site file at +path+ may be handed to the classifier.
+  # Returns false when no site_files row matches +path+, when the row is a
+  # directory, when its size exceeds SiteFile::CLASSIFIER_LIMIT, or when
+  # +path+ does not end in ".html"; returns true otherwise.
+  def classification_allowed?(path)
+    site_file = site_files_dataset.where(path: path).first
+    # Without this guard an unknown path raised NoMethodError on nil below.
+    return false if site_file.nil?
+    return false if site_file.is_directory
+    return false if site_file.size > SiteFile::CLASSIFIER_LIMIT
+    return false unless path.match(/\.html$/)
+    true
+  end
+
+  # Builds the text handed to the classifier/trainer for the file at +path+:
+  # the file contents go through Sanitize.fragment, then "http://" and
+  # "https://" prefixes are removed, every character that is neither a word
+  # character nor whitespace is dropped, and the text is lowercased.
+  # Returns the unique whitespace-separated words shorter than
+  # SiteFile::CLASSIFIER_WORD_LIMIT, joined by single spaces.
+  def process_for_classification(path)
+    text = Sanitize.fragment(get_file(path))
+    text = text.gsub(/(http|https):\/\//, '')
+    text = text.gsub(/[^\w\s]/, '').downcase
+    words = text.split.uniq
+    words.reject { |word| word.length >= SiteFile::CLASSIFIER_WORD_LIMIT }.join(' ')
+  end
+
  # array of hashes: filename, tempfile, opts.
  def store_files(files, opts={})
    results = []