mirror of
https://github.com/neocities/neocities.git
synced 2025-04-24 17:22:35 +02:00
experimental statistical language classification system
This commit is contained in:
parent
9cc85a48bb
commit
242cc6e92d
15 changed files with 178 additions and 37 deletions
2
Gemfile
2
Gemfile
|
@ -32,6 +32,8 @@ gem 'rye'
|
|||
gem 'dnsruby'
|
||||
gem 'base32'
|
||||
gem 'coveralls', require: false
|
||||
gem 'sanitize'
|
||||
gem 'linnaeus', git: 'https://github.com/neocities/linnaeus.git', branch: 'soften_redis_gemspec'
|
||||
|
||||
platform :mri, :rbx do
|
||||
gem 'magic' # sudo apt-get install file, For OSX: brew install libmagic
|
||||
|
|
11
Rakefile
11
Rakefile
|
@ -291,3 +291,14 @@ task :update_screenshots => [:environment] do
|
|||
}
|
||||
end
|
||||
=end
|
||||
|
||||
# Feeds every HTML file of every site that is neither banned nor deleted to
# Site#train, using that method's default category.
desc 'train_classifier'
task :train_classifier => [:environment] do
  Site.select(:id, :username).where(is_banned: false, is_deleted: false).all.each do |site|
    html_files = site.site_files_dataset.where(path: /\.html$/).all

    html_files.each do |file|
      # html_files is an Array, so the path has to come from the yielded record.
      site.train file.path
    end
  end
end
|
||||
|
|
|
@ -120,6 +120,11 @@ post '/admin/banhammer' do
|
|||
|
||||
site = Site[username: params[:username]]
|
||||
|
||||
# Re-file the site's index.html under the submitted classifier category.
# Skipped when no site matched (the not-found branch right after this handles
# that case) or when the request carried no classifier value at all; #to_s
# keeps a missing param from raising on nil.
if site && !params[:classifier].to_s.empty?
  site.untrain 'index.html'
  site.train 'index.html', params[:classifier]
end
|
||||
|
||||
if site.nil?
|
||||
flash[:error] = 'User not found'
|
||||
redirect '/admin'
|
||||
|
|
|
@ -131,3 +131,6 @@ $country_codes = {}
|
|||
CSV.foreach("./files/country_codes.csv") do |row|
|
||||
$country_codes[row.last] = row.first
|
||||
end
|
||||
|
||||
$classifier = Linnaeus::Classifier.new redis_db: 1
|
||||
$trainer = Linnaeus::Trainer.new redis_db: 1
|
||||
|
|
9
migrations/083_add_classifiers.rb
Normal file
9
migrations/083_add_classifiers.rb
Normal file
|
@ -0,0 +1,9 @@
|
|||
# Adds a text column named classifier (nil default, index requested) to
# site_files; the down step drops the column again.
Sequel.migration do
  up do
    DB.add_column :site_files, :classifier, :text, default: nil, index: true
  end

  down do
    DB.drop_column :site_files, :classifier
  end
end
|
|
@ -420,7 +420,7 @@ class Site < Sequel::Model
|
|||
end
|
||||
|
||||
# Returns the contents of the site file at +path+.
#
# The location is resolved with current_files_path only. An extra read through
# files_path would be thrown away and can raise when the file is not stored
# under that root.
def get_file(path)
  File.read current_files_path(path)
end
|
||||
|
||||
def before_destroy
|
||||
|
@ -930,6 +930,12 @@ class Site < Sequel::Model
|
|||
File.join TEMPLATE_ROOT, name
|
||||
end
|
||||
|
||||
# Directory that currently holds the files for +name+ (defaults to username):
# a folder under BANNED_SITES_ROOT when is_banned is set, otherwise the result
# of base_files_path.
# Raises 'username missing' when +name+ is nil or empty.
def current_base_files_path(name=username)
  if name.nil? || name.empty?
    raise 'username missing'
  end

  if is_banned
    File.join(BANNED_SITES_ROOT, name)
  else
    base_files_path(name)
  end
end
|
||||
|
||||
def base_files_path(name=username)
|
||||
raise 'username missing' if name.nil? || name.empty?
|
||||
File.join SITE_FILES_ROOT, name
|
||||
|
@ -950,6 +956,10 @@ class Site < Sequel::Model
|
|||
clean.join '/'
|
||||
end
|
||||
|
||||
# Joins the scrubbed form of +path+ onto current_base_files_path.
# With the default empty path the base directory itself is the first segment.
def current_files_path(path='')
  root = current_base_files_path
  File.join(root, scrubbed_path(path))
end
|
||||
|
||||
# Joins the scrubbed form of +path+ onto base_files_path.
def files_path(path='')
  root = base_files_path
  File.join(root, scrubbed_path(path))
end
|
||||
|
@ -1235,6 +1245,45 @@ class Site < Sequel::Model
|
|||
!site_files_dataset.where(path: /^\/?index.html$/).where(sha1_hash: EMPTY_FILE_HASH).first.nil?
|
||||
end
|
||||
|
||||
# Runs the global classifier over the processed text of the file at +path+.
# Returns whatever $classifier.classify returns, or nil when
# classification_allowed? rejects the path.
def classify(path)
  if classification_allowed?(path)
    $classifier.classify(process_for_classification(path))
  end
end
|
||||
|
||||
# Returns $classifier.classification_scores for the processed text of the file
# at +path+, or nil when classification_allowed? rejects the path.
def classification_scores(path)
  return nil unless classification_allowed?(path)

  text = process_for_classification(path)
  $classifier.classification_scores(text)
end
|
||||
|
||||
# Trains the global trainer with the processed text of the file at +path+ under
# +category+, then records that category on the matching site_files row
# (saved without validation).
# Returns nil without doing anything when classification_allowed? rejects the
# path; otherwise returns the result of save_changes.
def train(path, category='ham')
  return nil unless classification_allowed?(path)

  text = process_for_classification(path)
  $trainer.train(category, text)

  record = site_files_dataset.where(path: path).first
  record.classifier = category
  record.save_changes(validate: false)
end
|
||||
|
||||
# Removes the processed text of the file at +path+ from +category+ in the
# global trainer and clears the category recorded on the matching site_files
# row (saved without validation).
# Returns nil without doing anything when classification_allowed? rejects the
# path; otherwise returns the result of save_changes.
def untrain(path, category='ham')
  return nil unless classification_allowed?(path)

  $trainer.untrain(category, process_for_classification(path))

  site_file = site_files_dataset.where(path: path).first
  # The file no longer contributes to any category, so the marker is cleared
  # rather than set to the category that was just removed.
  site_file.classifier = nil
  site_file.save_changes validate: false
end
|
||||
|
||||
# Whether the file at +path+ may be fed to the classifier.
#
# Returns true only when the path ends in ".html" and a site_files row exists
# for it that is not a directory and whose size does not exceed
# SiteFile::CLASSIFIER_LIMIT. Returns false in every other case, including
# when no row matches the path.
def classification_allowed?(path)
  # Extension check first: it needs no query. end_with? also rejects paths
  # where ".html" is merely followed by a newline, which a `$` anchor accepts.
  return false unless path.end_with?('.html')

  site_file = site_files_dataset.where(path: path).first
  return false if site_file.nil?
  return false if site_file.is_directory
  return false if site_file.size > SiteFile::CLASSIFIER_LIMIT

  true
end
|
||||
|
||||
# Turns the file at +path+ into the text handed to the classifier: the file is
# passed through Sanitize.fragment, "http://" / "https://" prefixes and every
# character that is neither a word character nor whitespace are removed, the
# text is lowercased, and the distinct words shorter than
# SiteFile::CLASSIFIER_WORD_LIMIT are joined with single spaces.
def process_for_classification(path)
  text = Sanitize.fragment(get_file(path))
  text = text.gsub(/(http|https):\/\//, '')
  text = text.gsub(/[^\w\s]/, '').downcase

  distinct_words = text.split.uniq
  distinct_words.reject { |word| word.length >= SiteFile::CLASSIFIER_WORD_LIMIT }.join(' ')
end
|
||||
|
||||
# array of hashes: filename, tempfile, opts.
|
||||
def store_files(files, opts={})
|
||||
results = []
|
||||
|
|
|
@ -1,4 +1,9 @@
|
|||
require 'sanitize'
|
||||
require 'linnaeus'
|
||||
|
||||
class SiteFile < Sequel::Model
|
||||
CLASSIFIER_LIMIT = 1_000_000.freeze
|
||||
CLASSIFIER_WORD_LIMIT = 25.freeze
|
||||
unrestrict_primary_key
|
||||
plugin :update_primary_key
|
||||
many_to_one :site
|
||||
|
|
46
tests/acceptance/browse_tests.rb
Normal file
46
tests/acceptance/browse_tests.rb
Normal file
|
@ -0,0 +1,46 @@
|
|||
require_relative './environment.rb'
|
||||
|
||||
# Acceptance tests for the moderation buttons shown to admins on /browse.
describe '/browse' do
  include Capybara::DSL

  describe 'as admin' do
    before do
      DB[:sites_tags].delete
      DB[:sites].delete
      Capybara.reset_sessions!
      @admin = Fabricate :site, is_admin: true
      @site = Fabricate :site, site_changed: true
      page.set_rack_session id: @admin.id
    end

    # Opens the newest-first browse page and presses the button named +label+
    # inside the gallery entry that belongs to @site.
    def press_button_for_site(label)
      visit '/browse?sort_by=newest'
      within(".website-Gallery li#username_#{@site.username}") { click_button label }
    end

    # Classifier stored on the index.html row of @site.
    def index_classifier
      @site.site_files_dataset.where(path: 'index.html').first.classifier
    end

    it 'bans from browse for admin' do
      press_button_for_site 'Ban'

      @site.reload.is_banned.must_equal true
      @admin.reload.is_banned.must_equal false
    end

    it 'bans for spam' do
      press_button_for_site 'Spam'

      @site.reload.is_banned.must_equal true
      index_classifier.must_equal 'spam'
    end

    it 'bans for phishing' do
      press_button_for_site 'Phishing'

      @site.reload.is_banned.must_equal true
      index_classifier.must_equal 'phishing'
    end
  end
end
|
1
tests/files/classifier/ham.html
Normal file
1
tests/files/classifier/ham.html
Normal file
|
@ -0,0 +1 @@
|
|||
I am a piece of ham.
|
1
tests/files/classifier/phishing.html
Normal file
1
tests/files/classifier/phishing.html
Normal file
|
@ -0,0 +1 @@
|
|||
Facebook login enter your password derrp
|
1
tests/files/classifier/spam.html
Normal file
1
tests/files/classifier/spam.html
Normal file
|
@ -0,0 +1 @@
|
|||
Ham sucks. How would you like to buy some spam?
|
|
@ -329,5 +329,25 @@ describe 'site_files' do
|
|||
upload 'files[]' => Rack::Test::UploadedFile.new('./tests/files/index.html', 'text/html')
|
||||
@site.reload.changed_count.must_equal 2
|
||||
end
|
||||
|
||||
# Uploads one fixture per category, trains each under its category, then
# expects every fixture to classify back into the category it was trained as.
describe 'classification' do
  before do
    # Start from an empty store so earlier training cannot influence results.
    $trainer.instance_variable_get('@db').redis.flushall
  end

  it 'trains files' do
    categories = %w[ham spam phishing]

    categories.each do |category|
      fixture = "./tests/files/classifier/#{category}.html"
      upload 'files[]' => Rack::Test::UploadedFile.new(fixture, 'text/html')
    end

    categories.each { |category| @site.train "#{category}.html", category }

    categories.each do |category|
      @site.classify("#{category}.html").must_equal category
    end
  end
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -5,6 +5,15 @@ def app
|
|||
end
|
||||
|
||||
describe Site do
|
||||
# After ban!, index.html must still exist and current_files_path must point
# below Site::BANNED_SITES_ROOT.
describe 'banning' do
  it 'still makes files available' do
    banned_site = Fabricate :site
    banned_site.ban!

    index_path = banned_site.current_files_path('index.html')
    expected_path = File.join(Site::BANNED_SITES_ROOT, banned_site.username, 'index.html')

    File.exist?(index_path).must_equal true
    index_path.must_equal expected_path
  end
end
|
||||
|
||||
describe 'directory create' do
|
||||
it 'handles wacky pathnames' do
|
||||
['/derp', '/derp/'].each do |path|
|
||||
|
|
|
@ -75,7 +75,7 @@
|
|||
<% else %>
|
||||
<ul class="row website-Gallery content int-Gall">
|
||||
<% @sites.each_with_index do |site,i| %>
|
||||
<li>
|
||||
<li id="username_<%= site.username %>">
|
||||
<a href="<%= site.uri %>" class="neo-Screen-Shot" title="<%= site.title %>" onclick="surf(<%= ((@current_page-1)*Site::BROWSE_PAGINATION_LENGTH)+i+1 %>); return false">
|
||||
<span class="img-Holder" style="background:url(<%= site.screenshot_url('index.html', '540x405') %>) no-repeat;">
|
||||
<img src="/img/placeholder.png" alt="<%= site.title %>" />
|
||||
|
@ -119,6 +119,20 @@
|
|||
<button>Ban</button>
|
||||
</form>
|
||||
|
||||
<form action="/admin/banhammer" target="_blank" method="POST" onsubmit="return confirm('Confirm ban of <%= site.username %>');">
|
||||
<%== csrf_token_input_html %>
|
||||
<input type="hidden" name="username" value="<%= site.username %>">
|
||||
<input type="hidden" name="classifier" value="spam">
|
||||
<button>Spam</button>
|
||||
</form>
|
||||
|
||||
<form action="/admin/banhammer" target="_blank" method="POST" onsubmit="return confirm('Confirm ban of <%= site.username %>');">
|
||||
<%== csrf_token_input_html %>
|
||||
<input type="hidden" name="username" value="<%= site.username %>">
|
||||
<input type="hidden" name="classifier" value="phishing">
|
||||
<button>Phishing</button>
|
||||
</form>
|
||||
|
||||
<form action="/admin/mark_nsfw" target="_blank" method="POST" onsubmit="return confirm('Confirm NSFW marking of <%= site.username %>');">
|
||||
<%== csrf_token_input_html %>
|
||||
<input type="hidden" name="username" value="<%= site.username %>">
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
<div class="header-Outro">
|
||||
<div class="row content single-Col">
|
||||
<h1>Neocities Hotlinking Policy</h1>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="content single-Col misc-page">
|
||||
<p>Neocities has measures in place to prevent hotlinking of content from non-Neocities sites.</p>
|
||||
|
||||
<h2>What is hotlinking, and why don't you allow it?</h2>
|
||||
<p>
|
||||
<a href="https://en.wikipedia.org/wiki/Inline_linking">Hotlinking</a> is when a non-HTML file hosted by Neocities is embedded into another site that is not hosted by Neocities.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
We don't allow this because the purpose of Neocities is not to be a file server, but to be a portal for people creating their own web sites. When Neocities is used as a file dump for other sites, it hurts Neocities sites by cutting into the amount of bandwidth available to them. We want to promote Neocities sites, and this allows us to focus our resources on serving them as fast as possible.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
If you have a site on another server, the best way to serve files for it is by putting them directly on that server. Not only does this ensure that the files will not disappear in the future, but it also makes it easier for you to back up that site independently.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
If you need to store files for hotlinking use, there are plenty of alternatives. <a href="http://imgur.com/">Imgur</a> for example is a place you can store images that can be served by anyone.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Please keep in mind that our reasons for doing this are purely out of necessity. We need to focus on being sustainable to run Neocities without advertisers and stay independent, and preventing Neocities from being used as a "file dump host" is a part of that process.
|
||||
</p>
|
||||
|
||||
<h2>Is there a way to remove the hotlinking restriction?</h2>
|
||||
<p>
|
||||
The hotlinking policy currently applies to all sites. We can't currently make exceptions because of the way our proxy servers work. We may change this in the future.
|
||||
</p>
|
||||
</div>
|
Loading…
Add table
Reference in a new issue