experimental statistical language classification system

This commit is contained in:
Kyle Drake 2015-12-28 14:32:43 -06:00
parent 9cc85a48bb
commit 242cc6e92d
15 changed files with 178 additions and 37 deletions

View file

@ -32,6 +32,8 @@ gem 'rye'
gem 'dnsruby' gem 'dnsruby'
gem 'base32' gem 'base32'
gem 'coveralls', require: false gem 'coveralls', require: false
gem 'sanitize'
gem 'linnaeus', git: 'https://github.com/neocities/linnaeus.git', branch: 'soften_redis_gemspec'
platform :mri, :rbx do platform :mri, :rbx do
gem 'magic' # sudo apt-get install file, For OSX: brew install libmagic gem 'magic' # sudo apt-get install file, For OSX: brew install libmagic

View file

@ -291,3 +291,14 @@ task :update_screenshots => [:environment] do
} }
end end
=end =end
# Trains the classifier on every .html file of every site that is neither
# banned nor deleted. Each file is trained with Site#train's default category.
desc 'train_classifier'
task :train_classifier => [:environment] do
  Site.select(:id, :username).where(is_banned: false, is_deleted: false).all.each do |site|
    html_files = site.site_files_dataset.where(path: /\.html$/).all
    html_files.each do |file|
      # Train on the individual file's path; the collection itself has no #path.
      site.train file.path
    end
  end
end

View file

@ -120,6 +120,11 @@ post '/admin/banhammer' do
site = Site[username: params[:username]] site = Site[username: params[:username]]
# Retrain index.html only when the site exists and a non-empty classifier was
# posted. params[:classifier] is nil when the submitting form has no such
# field, so coerce it to a string before testing for emptiness.
if site && !params[:classifier].to_s.empty?
  site.untrain 'index.html'
  site.train 'index.html', params[:classifier]
end
if site.nil? if site.nil?
flash[:error] = 'User not found' flash[:error] = 'User not found'
redirect '/admin' redirect '/admin'

View file

@ -131,3 +131,6 @@ $country_codes = {}
CSV.foreach("./files/country_codes.csv") do |row| CSV.foreach("./files/country_codes.csv") do |row|
$country_codes[row.last] = row.first $country_codes[row.last] = row.first
end end
# Process-wide Linnaeus handles; both are constructed with redis_db 1.
$classifier = Linnaeus::Classifier.new(redis_db: 1)
$trainer    = Linnaeus::Trainer.new(redis_db: 1)

View file

@ -0,0 +1,9 @@
# Adds a text `classifier` column (default nil) to site_files; the down step
# removes it again.
# NOTE(review): Sequel documents :index for create_table columns; confirm that
# add_column honors it here, otherwise an explicit add_index is needed.
Sequel.migration do
  up do
    DB.add_column(:site_files, :classifier, :text, default: nil, index: true)
  end

  down do
    DB.drop_column(:site_files, :classifier)
  end
end

View file

@ -420,7 +420,7 @@ class Site < Sequel::Model
end end
def get_file(path) def get_file(path)
File.read files_path(path) File.read current_files_path(path)
end end
def before_destroy def before_destroy
@ -930,6 +930,12 @@ class Site < Sequel::Model
File.join TEMPLATE_ROOT, name File.join TEMPLATE_ROOT, name
end end
# Directory that currently holds the site's files: the banned-sites root when
# the site is banned, otherwise whatever base_files_path returns.
#
# Raises RuntimeError ('username missing') when name is nil or empty.
def current_base_files_path(name=username)
  if name.nil? || name.empty?
    raise 'username missing'
  end

  is_banned ? File.join(BANNED_SITES_ROOT, name) : base_files_path(name)
end
def base_files_path(name=username) def base_files_path(name=username)
raise 'username missing' if name.nil? || name.empty? raise 'username missing' if name.nil? || name.empty?
File.join SITE_FILES_ROOT, name File.join SITE_FILES_ROOT, name
@ -950,6 +956,10 @@ class Site < Sequel::Model
clean.join '/' clean.join '/'
end end
# Joins the scrubbed form of path onto the site's current base directory
# (see current_base_files_path).
def current_files_path(path='')
  root = current_base_files_path
  relative = scrubbed_path(path)
  File.join(root, relative)
end
def files_path(path='') def files_path(path='')
File.join base_files_path, scrubbed_path(path) File.join base_files_path, scrubbed_path(path)
end end
@ -1235,6 +1245,45 @@ class Site < Sequel::Model
!site_files_dataset.where(path: /^\/?index.html$/).where(sha1_hash: EMPTY_FILE_HASH).first.nil? !site_files_dataset.where(path: /^\/?index.html$/).where(sha1_hash: EMPTY_FILE_HASH).first.nil?
end end
# Returns $classifier's classification of the processed text of the file at
# path, or nil when classification_allowed? rejects the path.
def classify(path)
  if classification_allowed?(path)
    $classifier.classify(process_for_classification(path))
  end
end
# Returns $classifier's classification_scores for the processed text of the
# file at path, or nil when classification_allowed? rejects the path.
def classification_scores(path)
  allowed = classification_allowed?(path)
  return nil unless allowed

  words = process_for_classification(path)
  $classifier.classification_scores(words)
end
# Feeds the processed text of the file at path to $trainer under category,
# then stores that category on the matching site_file row (saved without
# validation). Returns nil without doing anything when classification_allowed?
# rejects the path; otherwise returns the result of save_changes.
def train(path, category='ham')
  unless classification_allowed?(path)
    return nil
  end

  text = process_for_classification(path)
  $trainer.train(category, text)

  record = site_files_dataset.where(path: path).first
  record.classifier = category
  record.save_changes(validate: false)
end
# Removes the processed text of the file at path from $trainer's data for
# category, then clears the classifier marker on the matching site_file row
# (saved without validation).
#
# Returns nil without doing anything when classification_allowed? rejects the
# path; otherwise returns the result of save_changes.
def untrain(path, category='ham')
  return nil unless classification_allowed? path
  $trainer.untrain(category, process_for_classification(path))
  site_file = site_files_dataset.where(path: path).first
  # The file is no longer trained, so it must not stay tagged with the
  # category that was just removed.
  site_file.classifier = nil
  site_file.save_changes validate: false
end
# Whether the file at path may be fed to the classifier.
#
# Only paths ending in ".html" qualify, and only when a matching site_file row
# exists, is not a directory, and is no larger than SiteFile::CLASSIFIER_LIMIT.
#
# Returns false (instead of raising) when no site_file row matches path.
def classification_allowed?(path)
  # Cheap extension check first so non-HTML paths never hit the database.
  return false unless path.match(/\.html$/)
  site_file = site_files_dataset.where(path: path).first
  return false if site_file.nil?
  return false if site_file.is_directory
  return false if site_file.size > SiteFile::CLASSIFIER_LIMIT
  true
end
# Turns the file at path into the word string handed to the classifier:
# sanitize the markup, drop http:// and https:// prefixes, strip every
# character that is neither a word character nor whitespace, lowercase, and
# keep each distinct word shorter than SiteFile::CLASSIFIER_WORD_LIMIT,
# joined by single spaces.
def process_for_classification(path)
  text = Sanitize.fragment(get_file(path))
  text = text.gsub(/(http|https):\/\//, '')
  text = text.gsub(/[^\w\s]/, '')
  words = text.downcase.split.uniq
  short_words = words.reject { |word| word.length >= SiteFile::CLASSIFIER_WORD_LIMIT }
  short_words.join(' ')
end
# array of hashes: filename, tempfile, opts. # array of hashes: filename, tempfile, opts.
def store_files(files, opts={}) def store_files(files, opts={})
results = [] results = []

View file

@ -1,4 +1,9 @@
require 'sanitize'
require 'linnaeus'
class SiteFile < Sequel::Model class SiteFile < Sequel::Model
# Upper bound compared against a site file's size before it is classified.
CLASSIFIER_LIMIT = 1_000_000
# Words of this length or longer are dropped from classifier input.
CLASSIFIER_WORD_LIMIT = 25
unrestrict_primary_key unrestrict_primary_key
plugin :update_primary_key plugin :update_primary_key
many_to_one :site many_to_one :site

View file

@ -0,0 +1,46 @@
require_relative './environment.rb'
# Admin moderation buttons on the /browse gallery.
describe '/browse' do
  include Capybara::DSL

  describe 'as admin' do
    before do
      DB[:sites_tags].delete
      DB[:sites].delete
      Capybara.reset_sessions!
      @admin = Fabricate :site, is_admin: true
      @site = Fabricate :site, site_changed: true
      page.set_rack_session id: @admin.id
    end

    # Opens the newest-first listing and presses the named button inside the
    # gallery entry belonging to @site.
    def press_gallery_button(label)
      visit '/browse?sort_by=newest'
      within(".website-Gallery li#username_#{@site.username}") do
        click_button label
      end
    end

    # Classifier value stored on @site's index.html record.
    def index_classifier
      @site.site_files_dataset.where(path: 'index.html').first.classifier
    end

    it 'bans from browse for admin' do
      press_gallery_button 'Ban'
      @site.reload.is_banned.must_equal true
      @admin.reload.is_banned.must_equal false
    end

    it 'bans for spam' do
      press_gallery_button 'Spam'
      @site.reload.is_banned.must_equal true
      index_classifier.must_equal 'spam'
    end

    it 'bans for phishing' do
      press_gallery_button 'Phishing'
      @site.reload.is_banned.must_equal true
      index_classifier.must_equal 'phishing'
    end
  end
end

View file

@ -0,0 +1 @@
I am a piece of ham.

View file

@ -0,0 +1 @@
Facebook login enter your password derrp

View file

@ -0,0 +1 @@
Ham sucks. How would you like to buy some spam?

View file

@ -329,5 +329,25 @@ describe 'site_files' do
upload 'files[]' => Rack::Test::UploadedFile.new('./tests/files/index.html', 'text/html') upload 'files[]' => Rack::Test::UploadedFile.new('./tests/files/index.html', 'text/html')
@site.reload.changed_count.must_equal 2 @site.reload.changed_count.must_equal 2
end end
# Uploads one fixture per category, trains on each, and expects each file to
# classify back into the category it was trained with.
describe 'classification' do
  before do
    # Start every example from an empty trainer store.
    $trainer.instance_variable_get('@db').redis.flushall
  end

  it 'trains files' do
    fixture_paths = [
      './tests/files/classifier/ham.html',
      './tests/files/classifier/spam.html',
      './tests/files/classifier/phishing.html'
    ]
    fixture_paths.each do |fixture|
      upload 'files[]' => Rack::Test::UploadedFile.new(fixture, 'text/html')
    end

    @site.train 'ham.html'
    @site.train 'spam.html', 'spam'
    @site.train 'phishing.html', 'phishing'

    %w[ham spam phishing].each do |category|
      @site.classify("#{category}.html").must_equal category
    end
  end
end
end end
end end

View file

@ -5,6 +5,15 @@ def app
end end
describe Site do describe Site do
# After a ban, current_files_path must point into the banned-sites root and
# the file must exist there.
describe 'banning' do
  it 'still makes files available' do
    banned_site = Fabricate :site
    banned_site.ban!

    index_path = banned_site.current_files_path('index.html')
    expected_path = File.join(Site::BANNED_SITES_ROOT, banned_site.username, 'index.html')

    File.exist?(index_path).must_equal true
    index_path.must_equal expected_path
  end
end
describe 'directory create' do describe 'directory create' do
it 'handles wacky pathnames' do it 'handles wacky pathnames' do
['/derp', '/derp/'].each do |path| ['/derp', '/derp/'].each do |path|

View file

@ -75,7 +75,7 @@
<% else %> <% else %>
<ul class="row website-Gallery content int-Gall"> <ul class="row website-Gallery content int-Gall">
<% @sites.each_with_index do |site,i| %> <% @sites.each_with_index do |site,i| %>
<li> <li id="username_<%= site.username %>">
<a href="<%= site.uri %>" class="neo-Screen-Shot" title="<%= site.title %>" onclick="surf(<%= ((@current_page-1)*Site::BROWSE_PAGINATION_LENGTH)+i+1 %>); return false"> <a href="<%= site.uri %>" class="neo-Screen-Shot" title="<%= site.title %>" onclick="surf(<%= ((@current_page-1)*Site::BROWSE_PAGINATION_LENGTH)+i+1 %>); return false">
<span class="img-Holder" style="background:url(<%= site.screenshot_url('index.html', '540x405') %>) no-repeat;"> <span class="img-Holder" style="background:url(<%= site.screenshot_url('index.html', '540x405') %>) no-repeat;">
<img src="/img/placeholder.png" alt="<%= site.title %>" /> <img src="/img/placeholder.png" alt="<%= site.title %>" />
@ -119,6 +119,20 @@
<button>Ban</button> <button>Ban</button>
</form> </form>
<form action="/admin/banhammer" target="_blank" method="POST" onsubmit="return confirm('Confirm ban of <%= site.username %>');">
<%== csrf_token_input_html %>
<input type="hidden" name="username" value="<%= site.username %>">
<input type="hidden" name="classifier" value="spam">
<button>Spam</button>
</form>
<form action="/admin/banhammer" target="_blank" method="POST" onsubmit="return confirm('Confirm ban of <%= site.username %>');">
<%== csrf_token_input_html %>
<input type="hidden" name="username" value="<%= site.username %>">
<input type="hidden" name="classifier" value="phishing">
<button>Phishing</button>
</form>
<form action="/admin/mark_nsfw" target="_blank" method="POST" onsubmit="return confirm('Confirm NSFW marking of <%= site.username %>');"> <form action="/admin/mark_nsfw" target="_blank" method="POST" onsubmit="return confirm('Confirm NSFW marking of <%= site.username %>');">
<%== csrf_token_input_html %> <%== csrf_token_input_html %>
<input type="hidden" name="username" value="<%= site.username %>"> <input type="hidden" name="username" value="<%= site.username %>">

View file

@ -1,35 +0,0 @@
<div class="header-Outro">
<div class="row content single-Col">
<h1>Neocities Hotlinking Policy</h1>
</div>
</div>
<div class="content single-Col misc-page">
<p>Neocities has measures in place to prevent hotlinking of content from non-Neocities sites.</p>
<h2>What is hotlinking, and why don't you allow it?</h2>
<p>
<a href="https://en.wikipedia.org/wiki/Inline_linking">Hotlinking</a> is when a non-HTML file hosted by Neocities is embedded into another site that is not hosted by Neocities.
</p>
<p>
We don't allow this because the purpose of Neocities is not to be a file server, but to be a portal for people creating their own web sites. When Neocities is used as a file dump for other sites, it hurts Neocities sites by cutting into the amount of bandwidth available to them. We want to promote Neocities sites, and this allows us to focus our resources on serving them as fast as possible.
</p>
<p>
If you have a site on another server, the best way to serve files for it is by putting them directly on that server. Not only does this ensure that the files will not disappear in the future, but it also makes it easier for you to back up that site independently.
</p>
<p>
If you need to store files for hotlinking use, there are plenty of alternatives. <a href="http://imgur.com/">Imgur</a> for example is a place you can store images that can be served by anyone.
</p>
<p>
Please keep in mind that our reasons for doing this are purely out of necessity. We need to focus on being sustainable to run Neocities without advertisers and stay independent, and preventing Neocities from being used as a "file dump host" is a part of that process.
</p>
<h2>Is there a way to remove the hotlinking restriction?</h2>
<p>
The hotlinking policy currently applies to all sites. We can't currently make exceptions because of the way our proxy servers work. We may change this in the future.
</p>
</div>