mirror of
https://github.com/neocities/neocities.git
synced 2025-08-03 00:02:00 +02:00
experimental statistical language classification system
This commit is contained in:
parent
9cc85a48bb
commit
242cc6e92d
15 changed files with 178 additions and 37 deletions
|
@ -420,7 +420,7 @@ class Site < Sequel::Model
|
|||
end
|
||||
|
||||
# Reads and returns the contents of one of this site's files.
#
# The file is located through current_files_path. An earlier, additional
# File.read against files_path(path) was dropped: its result was never used,
# and it could raise when the file is not present under that location.
#
# path - site-relative path of the file to read.
#
# Returns whatever File.read returns for the resolved path.
# Raises whatever File.read raises when the file cannot be read.
def get_file(path)
  File.read current_files_path(path)
end
|
||||
|
||||
def before_destroy
|
||||
|
@ -930,6 +930,12 @@ class Site < Sequel::Model
|
|||
File.join TEMPLATE_ROOT, name
|
||||
end
|
||||
|
||||
# Root directory that currently holds the files for a site.
#
# name - site username; defaults to this site's username.
#
# Returns a path under BANNED_SITES_ROOT when is_banned is truthy, otherwise
# the result of base_files_path(name).
# Raises RuntimeError ('username missing') when name is nil or empty.
def current_base_files_path(name=username)
  if name.nil? || name.empty?
    raise 'username missing'
  end

  if is_banned
    File.join(BANNED_SITES_ROOT, name)
  else
    base_files_path(name)
  end
end
|
||||
|
||||
def base_files_path(name=username)
|
||||
raise 'username missing' if name.nil? || name.empty?
|
||||
File.join SITE_FILES_ROOT, name
|
||||
|
@ -950,6 +956,10 @@ class Site < Sequel::Model
|
|||
clean.join '/'
|
||||
end
|
||||
|
||||
# Builds the path of a site file beneath current_base_files_path.
#
# path - site-relative path; it is passed through scrubbed_path before
#        being joined. Defaults to '' (the root itself).
#
# Returns the joined path String.
def current_files_path(path='')
  root = current_base_files_path
  relative = scrubbed_path(path)
  File.join(root, relative)
end
|
||||
|
||||
# Builds the path of a site file beneath base_files_path.
#
# path - site-relative path; it is passed through scrubbed_path before
#        being joined. Defaults to '' (the root itself).
#
# Returns the joined path String.
def files_path(path='')
  root = base_files_path
  relative = scrubbed_path(path)
  File.join(root, relative)
end
|
||||
|
@ -1235,6 +1245,45 @@ class Site < Sequel::Model
|
|||
!site_files_dataset.where(path: /^\/?index.html$/).where(sha1_hash: EMPTY_FILE_HASH).first.nil?
|
||||
end
|
||||
|
||||
# Asks the global $classifier to classify the file at path.
#
# path - site-relative path of the file.
#
# Returns nil when classification_allowed?(path) is falsy; otherwise the
# value returned by $classifier.classify for the processed file text.
def classify(path)
  if classification_allowed?(path)
    document = process_for_classification(path)
    $classifier.classify(document)
  end
end
|
||||
|
||||
# Asks the global $classifier for the classification scores of the file at path.
#
# path - site-relative path of the file.
#
# Returns nil when classification_allowed?(path) is falsy; otherwise the
# value returned by $classifier.classification_scores for the processed text.
def classification_scores(path)
  if classification_allowed?(path)
    document = process_for_classification(path)
    $classifier.classification_scores(document)
  end
end
|
||||
|
||||
# Trains the global $trainer with the file at path and records the category
# on the matching site file row.
#
# path     - site-relative path of the file.
# category - category label handed to the trainer; defaults to 'ham'.
#
# Returns nil when classification_allowed?(path) is falsy; otherwise the
# result of save_changes on the site file record.
def train(path, category='ham')
  if classification_allowed?(path)
    document = process_for_classification(path)
    $trainer.train(category, document)

    # Persist the label without running model validations.
    record = site_files_dataset.where(path: path).first
    record.classifier = category
    record.save_changes(validate: false)
  end
end
|
||||
|
||||
# Removes the file at path from the global $trainer's data for a category
# and writes the category to the matching site file row.
#
# path     - site-relative path of the file.
# category - category label handed to the trainer; defaults to 'ham'.
#
# Returns nil when classification_allowed?(path) is falsy; otherwise the
# result of save_changes on the site file record.
def untrain(path, category='ham')
  if classification_allowed?(path)
    document = process_for_classification(path)
    $trainer.untrain(category, document)

    # NOTE(review): the record is still labelled with `category` after being
    # untrained, exactly as train does -- confirm whether the label should be
    # cleared here instead.
    record = site_files_dataset.where(path: path).first
    record.classifier = category
    record.save_changes(validate: false)
  end
end
|
||||
|
||||
# Decides whether the file at path may be fed to the classifier.
#
# A file is allowed only when its path ends in '.html', a matching site file
# row exists, the row is not a directory, and its size does not exceed
# SiteFile::CLASSIFIER_LIMIT.
#
# path - site-relative path of the file.
#
# Returns true or false. A path with no matching row yields false rather
# than raising NoMethodError on nil.
def classification_allowed?(path)
  # Cheap check first so non-HTML paths never hit the database. end_with?
  # anchors at the true end of the string, unlike a `$` regex anchor which
  # also matches before an embedded newline.
  return false unless path.to_s.end_with?('.html')

  site_file = site_files_dataset.where(path: path).first
  return false if site_file.nil?
  return false if site_file.is_directory
  return false if site_file.size > SiteFile::CLASSIFIER_LIMIT

  true
end
|
||||
|
||||
# Turns the file at path into the text handed to the classifier.
#
# The file contents are run through Sanitize.fragment, "http://" and
# "https://" prefixes are removed, every character that is neither a word
# character nor whitespace is dropped, and the text is lowercased. The result
# is split on whitespace, de-duplicated, and stripped of words whose length
# is SiteFile::CLASSIFIER_WORD_LIMIT or more.
#
# path - site-relative path of the file.
#
# Returns a String of the remaining words joined by single spaces.
def process_for_classification(path)
  text = Sanitize.fragment(get_file(path))
  text = text.gsub(/(http|https):\/\//, '')
  text = text.gsub(/[^\w\s]/, '')

  words = text.downcase.split.uniq
  kept = words.reject { |word| word.length >= SiteFile::CLASSIFIER_WORD_LIMIT }
  kept.join(' ')
end
|
||||
|
||||
# array of hashes: filename, tempfile, opts.
|
||||
def store_files(files, opts={})
|
||||
results = []
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue