From 242cc6e92db4a35fb122c39b0aac2cb059498c1b Mon Sep 17 00:00:00 2001 From: Kyle Drake Date: Mon, 28 Dec 2015 14:32:43 -0600 Subject: [PATCH] experimental statistical language classification system --- Gemfile | 2 ++ Rakefile | 11 ++++++ app/admin.rb | 5 +++ environment.rb | 3 ++ migrations/083_add_classifiers.rb | 9 +++++ models/site.rb | 51 +++++++++++++++++++++++++++- models/site_file.rb | 5 +++ tests/acceptance/browse_tests.rb | 46 +++++++++++++++++++++++++ tests/files/classifier/ham.html | 1 + tests/files/classifier/phishing.html | 1 + tests/files/classifier/spam.html | 1 + tests/site_file_tests.rb | 20 +++++++++++ tests/site_tests.rb | 9 +++++ views/browse.erb | 16 ++++++++- views/hotlinking.erb | 35 ------------------- 15 files changed, 178 insertions(+), 37 deletions(-) create mode 100644 migrations/083_add_classifiers.rb create mode 100644 tests/acceptance/browse_tests.rb create mode 100644 tests/files/classifier/ham.html create mode 100644 tests/files/classifier/phishing.html create mode 100644 tests/files/classifier/spam.html delete mode 100644 views/hotlinking.erb diff --git a/Gemfile b/Gemfile index dcf35e6f..0c5c45db 100644 --- a/Gemfile +++ b/Gemfile @@ -32,6 +32,8 @@ gem 'rye' gem 'dnsruby' gem 'base32' gem 'coveralls', require: false +gem 'sanitize' +gem 'linnaeus', git: 'https://github.com/neocities/linnaeus.git', branch: 'soften_redis_gemspec' platform :mri, :rbx do gem 'magic' # sudo apt-get install file, For OSX: brew install libmagic diff --git a/Rakefile b/Rakefile index 4ef0d3f3..5cfc0cc9 100644 --- a/Rakefile +++ b/Rakefile @@ -291,3 +291,14 @@ task :update_screenshots => [:environment] do } end =end + +desc 'train_classifier' +task :train_classifier => [:environment] do + Site.select(:id, :username).where(is_banned: false, is_deleted: false).all.each do |site| + html_files = site.site_files_dataset.where(path: /\.html$/).all + + html_files.each do |file| + site.train file.path + end + end +end diff --git a/app/admin.rb 
b/app/admin.rb index c16131be..a79fb9e5 100644 --- a/app/admin.rb +++ b/app/admin.rb @@ -120,6 +120,11 @@ post '/admin/banhammer' do site = Site[username: params[:username]] + if site && !params[:classifier].to_s.empty? + site.untrain 'index.html' + site.train 'index.html', params[:classifier] + end + if site.nil? flash[:error] = 'User not found' redirect '/admin' diff --git a/environment.rb b/environment.rb index e47d2490..efaef8f0 100644 --- a/environment.rb +++ b/environment.rb @@ -131,3 +131,6 @@ $country_codes = {} CSV.foreach("./files/country_codes.csv") do |row| $country_codes[row.last] = row.first end + +$classifier = Linnaeus::Classifier.new redis_db: 1 +$trainer = Linnaeus::Trainer.new redis_db: 1 diff --git a/migrations/083_add_classifiers.rb b/migrations/083_add_classifiers.rb new file mode 100644 index 00000000..9292780b --- /dev/null +++ b/migrations/083_add_classifiers.rb @@ -0,0 +1,9 @@ +Sequel.migration do + up { + DB.add_column :site_files, :classifier, :text, default: nil, index: true + } + + down { + DB.drop_column :site_files, :classifier + } +end diff --git a/models/site.rb b/models/site.rb index 7dcbb4b8..f628a35d 100644 --- a/models/site.rb +++ b/models/site.rb @@ -420,7 +420,7 @@ class Site < Sequel::Model end def get_file(path) - File.read files_path(path) + File.read current_files_path(path) end def before_destroy @@ -930,6 +930,12 @@ class Site < Sequel::Model File.join TEMPLATE_ROOT, name end + def current_base_files_path(name=username) + raise 'username missing' if name.nil? || name.empty? + return File.join BANNED_SITES_ROOT, name if is_banned + base_files_path name + end + def base_files_path(name=username) raise 'username missing' if name.nil? || name.empty? 
File.join SITE_FILES_ROOT, name @@ -950,6 +956,10 @@ class Site < Sequel::Model clean.join '/' end + def current_files_path(path='') + File.join current_base_files_path, scrubbed_path(path) + end + def files_path(path='') File.join base_files_path, scrubbed_path(path) end @@ -1235,6 +1245,45 @@ class Site < Sequel::Model !site_files_dataset.where(path: /^\/?index.html$/).where(sha1_hash: EMPTY_FILE_HASH).first.nil? end + def classify(path) + return nil unless classification_allowed? path + $classifier.classify process_for_classification(path) + end + + def classification_scores(path) + return nil unless classification_allowed? path + $classifier.classification_scores process_for_classification(path) + end + + def train(path, category='ham') + return nil unless classification_allowed? path + $trainer.train(category, process_for_classification(path)) + site_file = site_files_dataset.where(path: path).first + site_file.classifier = category + site_file.save_changes validate: false + end + + def untrain(path, category='ham') + return nil unless classification_allowed? path + $trainer.untrain(category, process_for_classification(path)) + site_file = site_files_dataset.where(path: path).first + site_file.classifier = nil + site_file.save_changes validate: false + end + + def classification_allowed?(path) + site_file = site_files_dataset.where(path: path).first + return false if site_file.nil? || site_file.is_directory + return false if site_file.size > SiteFile::CLASSIFIER_LIMIT + return false if !path.match(/\.html$/) + true + end + + def process_for_classification(path) + sanitized = Sanitize.fragment get_file(path) + sanitized.gsub(/(http|https):\/\//, '').gsub(/[^\w\s]/, '').downcase.split.uniq.select{|v| v.length < SiteFile::CLASSIFIER_WORD_LIMIT}.join(' ') + end + # array of hashes: filename, tempfile, opts. 
def store_files(files, opts={}) results = [] diff --git a/models/site_file.rb b/models/site_file.rb index e605a325..fe4c8343 100644 --- a/models/site_file.rb +++ b/models/site_file.rb @@ -1,4 +1,9 @@ +require 'sanitize' +require 'linnaeus' + class SiteFile < Sequel::Model + CLASSIFIER_LIMIT = 1_000_000 + CLASSIFIER_WORD_LIMIT = 25 unrestrict_primary_key plugin :update_primary_key many_to_one :site diff --git a/tests/acceptance/browse_tests.rb b/tests/acceptance/browse_tests.rb new file mode 100644 index 00000000..af8c390a --- /dev/null +++ b/tests/acceptance/browse_tests.rb @@ -0,0 +1,46 @@ +require_relative './environment.rb' + +describe '/browse' do + include Capybara::DSL + + describe 'as admin' do + before do + DB[:sites_tags].delete + DB[:sites].delete + Capybara.reset_sessions! + @admin = Fabricate :site, is_admin: true + @site = Fabricate :site, site_changed: true + page.set_rack_session id: @admin.id + end + + it 'bans from browse for admin' do + visit '/browse?sort_by=newest' + within(".website-Gallery li#username_#{@site.username}") do + click_button 'Ban' + end + + @site.reload.is_banned.must_equal true + @admin.reload.is_banned.must_equal false + end + + it 'bans for spam' do + visit '/browse?sort_by=newest' + within(".website-Gallery li#username_#{@site.username}") do + click_button 'Spam' + end + + @site.reload.is_banned.must_equal true + @site.site_files_dataset.where(path: 'index.html').first.classifier.must_equal 'spam' + end + + it 'bans for phishing' do + visit '/browse?sort_by=newest' + within(".website-Gallery li#username_#{@site.username}") do + click_button 'Phishing' + end + + @site.reload.is_banned.must_equal true + @site.site_files_dataset.where(path: 'index.html').first.classifier.must_equal 'phishing' + end + end +end diff --git a/tests/files/classifier/ham.html b/tests/files/classifier/ham.html new file mode 100644 index 00000000..fb2e3f75 --- /dev/null +++ b/tests/files/classifier/ham.html @@ -0,0 +1 @@ +I am a piece of 
ham. diff --git a/tests/files/classifier/phishing.html b/tests/files/classifier/phishing.html new file mode 100644 index 00000000..b7a57acd --- /dev/null +++ b/tests/files/classifier/phishing.html @@ -0,0 +1 @@ +Facebook login enter your password derrp diff --git a/tests/files/classifier/spam.html b/tests/files/classifier/spam.html new file mode 100644 index 00000000..bea343a8 --- /dev/null +++ b/tests/files/classifier/spam.html @@ -0,0 +1 @@ +Ham sucks. How would you like to buy some spam? diff --git a/tests/site_file_tests.rb b/tests/site_file_tests.rb index b83c87c1..a303aa12 100644 --- a/tests/site_file_tests.rb +++ b/tests/site_file_tests.rb @@ -329,5 +329,25 @@ describe 'site_files' do upload 'files[]' => Rack::Test::UploadedFile.new('./tests/files/index.html', 'text/html') @site.reload.changed_count.must_equal 2 end + + describe 'classification' do + before do + $trainer.instance_variable_get('@db').redis.flushall + end + + it 'trains files' do + upload 'files[]' => Rack::Test::UploadedFile.new('./tests/files/classifier/ham.html', 'text/html') + upload 'files[]' => Rack::Test::UploadedFile.new('./tests/files/classifier/spam.html', 'text/html') + upload 'files[]' => Rack::Test::UploadedFile.new('./tests/files/classifier/phishing.html', 'text/html') + + @site.train 'ham.html' + @site.train 'spam.html', 'spam' + @site.train 'phishing.html', 'phishing' + + @site.classify('ham.html').must_equal 'ham' + @site.classify('spam.html').must_equal 'spam' + @site.classify('phishing.html').must_equal 'phishing' + end + end end end diff --git a/tests/site_tests.rb b/tests/site_tests.rb index ee79e74f..5cf387cb 100644 --- a/tests/site_tests.rb +++ b/tests/site_tests.rb @@ -5,6 +5,15 @@ def app end describe Site do + describe 'banning' do + it 'still makes files available' do + site = Fabricate :site + site.ban! 
+ File.exist?(site.current_files_path('index.html')).must_equal true + site.current_files_path('index.html').must_equal File.join(Site::BANNED_SITES_ROOT, site.username, 'index.html') + end + end + describe 'directory create' do it 'handles wacky pathnames' do ['/derp', '/derp/'].each do |path| diff --git a/views/browse.erb b/views/browse.erb index 171f2b0e..6dbe0187 100644 --- a/views/browse.erb +++ b/views/browse.erb @@ -75,7 +75,7 @@ <% else %>