From 3bca5e883991fab7bebf001022e9af57bd928ddf Mon Sep 17 00:00:00 2001
From: Kyle Drake
Date: Wed, 29 Apr 2015 18:18:02 -0700
Subject: [PATCH] Refactor logs, granularity to one day, with testing

---
 Rakefile                         |  36 +-----
 migrations/059_refactor_stats.rb |  55 ++++++++
 models/site_file.rb              |   3 +-
 models/stat.rb                   | 216 ++++++++++++++++++++++++++++++-
 models/stat_location.rb          |   3 +
 models/stat_path.rb              |   3 +
 models/stat_referrer.rb          |   3 +
 tests/environment.rb             |   2 +-
 tests/stat_tests.rb              |  77 ++++++++++++
 9 files changed, 360 insertions(+), 38 deletions(-)
 create mode 100644 migrations/059_refactor_stats.rb
 create mode 100644 models/stat_location.rb
 create mode 100644 models/stat_path.rb
 create mode 100644 models/stat_referrer.rb
 create mode 100644 tests/stat_tests.rb

diff --git a/Rakefile b/Rakefile
index ecbb5742..d15b9229 100644
--- a/Rakefile
+++ b/Rakefile
@@ -31,39 +31,7 @@ end
 
 desc "parse logs"
 task :parse_logs => [:environment] do
-  Dir[File.join($config['logs_path'], '*.log')].each do |log_path|
-    hits = {}
-    visits = {}
-    visit_ips = {}
-
-    logfile = File.open log_path, 'r'
-
-    while hit = logfile.gets
-      time, username, size, path, ip = hit.split ' '
-
-      hits[username] ||= 0
-      hits[username] += 1
-
-      visit_ips[username] = [] if !visit_ips[username]
-
-      unless visit_ips[username].include?(ip)
-        visits[username] ||= 0
-        visits[username] += 1
-        visit_ips[username] << ip
-      end
-    end
-
-    logfile.close
-
-    hits.each do |username,hitcount|
-      DB['update sites set hits=hits+? where username=?', hitcount, username].first
-    end
-
-    visits.each do |username,visitcount|
-      DB['update sites set views=views+? where username=?', visitcount, username].first
-    end
-
-    FileUtils.rm log_path
-  end
+  # Stat.parse_logfiles deletes each log file after recording it.
+  Stat.parse_logfiles $config['logs_path']
 end
 
diff --git a/migrations/059_refactor_stats.rb b/migrations/059_refactor_stats.rb
new file mode 100644
index 00000000..f9ead2af
--- /dev/null
+++ b/migrations/059_refactor_stats.rb
@@ -0,0 +1,55 @@
+Sequel.migration do
+  up {
+    DB.drop_table :stats
+    DB.create_table! :stats do
+      primary_key :id
+      Integer :site_id, index: true
+      Date :created_at, index: true
+      Integer :hits, default: 0
+      Integer :views, default: 0
+      Integer :comments, default: 0
+      Integer :follows, default: 0
+      Integer :site_updates, default: 0
+    end
+
+    DB.create_table! :stat_referrers do
+      primary_key :id
+      Integer :stat_id, index: true
+      String :url
+      Integer :views, default: 0
+    end
+
+    DB.create_table! :stat_locations do
+      primary_key :id
+      Integer :stat_id, index: true
+      String :country_code2
+      String :region_name
+      String :city_name
+      Decimal :latitude
+      Decimal :longitude
+      Integer :views, default: 0
+    end
+
+    DB.create_table! :stat_paths do
+      primary_key :id
+      Integer :stat_id, index: true
+      String :name
+      Integer :views, default: 0
+    end
+  }
+
+  down {
+    DB.drop_table :stats
+    DB.create_table! :stats do
+      primary_key :id
+      Integer :site_id, index: true
+      Integer :hits, default: 0
+      Integer :views, default: 0
+      DateTime :created_at, index: true
+    end
+
+    DB.drop_table :stat_referrers
+    DB.drop_table :stat_locations
+    DB.drop_table :stat_paths
+  }
+end
diff --git a/models/site_file.rb b/models/site_file.rb
index 7e714808..fa5db8d6 100644
--- a/models/site_file.rb
+++ b/models/site_file.rb
@@ -1,6 +1,5 @@
 class SiteFile < Sequel::Model
-  unrestrict_primary_key
   plugin :update_primary_key
 
   many_to_one :site
-end
\ No newline at end of file
+end
diff --git a/models/stat.rb b/models/stat.rb
index 007cf7eb..c6540427 100644
--- a/models/stat.rb
+++ b/models/stat.rb
@@ -1,3 +1,217 @@
 class Stat < Sequel::Model
+  GEOCITY_PATH = './files/GeoLiteCity.dat'
+
   many_to_one :site
-end
\ No newline at end of file
+  one_to_many :stat_referrers
+  one_to_many :stat_locations
+  one_to_many :stat_paths
+
+  class << self
+    # Parses every "*.log" file directly under +path+: adds each site's hit
+    # count and unique-IP view count to its +sites+ row and to its +stats+
+    # row for the current UTC day, then deletes the log file.
+    #
+    # Each line is split on whitespace into six fields:
+    #   time username size path ip referrer
+    def parse_logfiles(path)
+      Dir["#{path}/*.log"].each do |log_path|
+        site_logs = {}
+
+        # File.foreach closes the log even when a line raises mid-parse.
+        File.foreach(log_path) do |hit|
+          _time, username, _size, _path, ip, referrer = hit.split ' '
+
+          # Skip lines with fewer than six fields (nil referrer) and bot hits.
+          next if referrer.nil? || referrer =~ /bot/i
+
+          site_log = (site_logs[username] ||= {hits: 0, views: 0, view_ips: {}})
+          site_log[:hits] += 1
+
+          # A view is the first hit from an IP for this site within this file.
+          unless site_log[:view_ips].key?(ip)
+            site_log[:views] += 1
+            site_log[:view_ips][ip] = true
+          end
+        end
+
+        current_day_string = Time.now.utc.to_date.to_s
+
+        Site.select(:id, :username).where(username: site_logs.keys).all.each do |site|
+          site_logs[site.username][:id] = site.id
+        end
+
+        DB.transaction do
+          site_logs.each do |username, site_log|
+            # Usernames with no matching site never get an :id; skip them so no
+            # stats row is created with a NULL site_id.
+            next unless site_log[:id]
+
+            DB['update sites set hits=hits+?, views=views+? where username=?',
+              site_log[:hits],
+              site_log[:views],
+              username
+            ].first
+
+            opts = {site_id: site_log[:id], created_at: current_day_string}
+
+            stat = Stat.select(:id).where(opts).first
+            DB[:stats].lock('EXCLUSIVE') { stat = Stat.create opts } if stat.nil?
+
+            # Target today's row by primary key; filtering on site_id alone
+            # would also increment this site's rows for every other day.
+            DB[
+              'update stats set hits=hits+?, views=views+? where id=?',
+              site_log[:hits],
+              site_log[:views],
+              stat.id
+            ].first
+          end
+        end
+
+        FileUtils.rm log_path
+      end
+    end
+
+    # FIXME: unfinished sketch. +opts+ and +site_log+ are not defined in this
+    # scope, so calling this method raises NameError.
+    def get_or_create
+      DB[:stats].lock 'EXCLUSIVE' do
+        stat = Stat.where(opts).first
+        stat ||= Stat.new opts
+        stat.hits += site_log[:hits]
+        stat.views += site_log[:views]
+      end
+    end
+  end
+end
+
+=begin
+require 'io/extra'
+require 'geoip'
+
+# Note: This isn't really a class right now.
+module Stat
+
+
+  class << self
+    def parse_logfiles(path)
+      Dir["#{path}/*.log"].each do |logfile_path|
+        parse_logfile logfile_path
+        FileUtils.rm logfile_path
+      end
+    end
+
+    def parse_logfile(path)
+      geoip = GeoIP.new GEOCITY_PATH
+      logfile = File.open path, 'r'
+
+      hits = []
+
+      while hit = logfile.gets
+        time, username, size, path, ip, referrer = hit.split ' '
+
+        site = Site.select(:id).where(username: username).first
+        next unless site
+
+        paths_dataset = StatsDB[:paths]
+        path_record = paths_dataset[name: path]
+        path_id = path_record ? path_record[:id] : paths_dataset.insert(name: path)
+
+        referrers_dataset = StatsDB[:referrers]
+        referrer_record = referrers_dataset[name: referrer]
+        referrer_id = referrer_record ? referrer_record[:id] : referrers_dataset.insert(name: referrer)
+
+        location_id = nil
+
+        if city = geoip.city(ip)
+          locations_dataset = StatsDB[:locations].select(:id)
+          location_hash = {country_code2: city.country_code2, region_name: city.region_name, city_name: city.city_name}
+
+          location = locations_dataset.where(location_hash).first
+          location_id = location ? location[:id] : locations_dataset.insert(location_hash)
+        end
+
+        hits << [site.id, referrer_id, path_id, location_id, size, time]
+      end
+
+      StatsDB[:hits].import(
+        [:site_id, :referrer_id, :path_id, :location_id, :bytes_sent, :logged_at],
+        hits
+      )
+    end
+  end
+end
+
+
+
+
+=begin
+    def parse_logfile(path)
+      hits = {}
+      visits = {}
+      visit_ips = {}
+
+      logfile = File.open path, 'r'
+
+      while hit = logfile.gets
+        time, username, size, path, ip, referrer = hit.split ' '
+
+        hits[username] ||= 0
+        hits[username] += 1
+        visit_ips[username] = [] if !visit_ips[username]
+
+        unless visit_ips[username].include? ip
+          visits[username] ||= 0
+          visits[username] += 1
+          visit_ips[username] << ip
+        end
+      end
+
+      logfile.close
+
+
+      hits.each do |username,hitcount|
+        DB['update sites set hits=hits+? where username=?', hitcount, username].first
+      end
+
+      visits.each do |username,visitcount|
+        DB['update sites set views=views+? where username=?', visitcount, username].first
+      end
+    end
+  end
+=end
+
+=begin
+  def self.parse(logfile_path)
+    hits = {}
+    visits = {}
+    visit_ips = {}
+
+    logfile = File.open logfile_path, 'r'
+
+    while hit = logfile.gets
+      time, username, size, path, ip = hit.split ' '
+
+      hits[username] ||= 0
+      hits[username] += 1
+
+      visit_ips[username] = [] if !visit_ips[username]
+
+      unless visit_ips[username].include?(ip)
+        visits[username] ||= 0
+        visits[username] += 1
+        visit_ips[username] << ip
+      end
+    end
+
+    logfile.close
+
+    hits.each do |username,hitcount|
+      DB['update sites set hits=hits+? where username=?', hitcount, username].first
+    end
+
+    visits.each do |username,visitcount|
+      DB['update sites set views=views+? where username=?', visitcount, username].first
+    end
+  end
+=end
diff --git a/models/stat_location.rb b/models/stat_location.rb
new file mode 100644
index 00000000..df3fcb8c
--- /dev/null
+++ b/models/stat_location.rb
@@ -0,0 +1,3 @@
+class StatLocation < Sequel::Model
+  many_to_one :stat
+end
diff --git a/models/stat_path.rb b/models/stat_path.rb
new file mode 100644
index 00000000..0270d3f8
--- /dev/null
+++ b/models/stat_path.rb
@@ -0,0 +1,3 @@
+class StatPath < Sequel::Model
+  many_to_one :stat
+end
diff --git a/models/stat_referrer.rb b/models/stat_referrer.rb
new file mode 100644
index 00000000..f17a33bd
--- /dev/null
+++ b/models/stat_referrer.rb
@@ -0,0 +1,3 @@
+class StatReferrer < Sequel::Model
+  many_to_one :stat
+end
diff --git a/tests/environment.rb b/tests/environment.rb
index 4cd4a7ac..3247bc9f 100644
--- a/tests/environment.rb
+++ b/tests/environment.rb
@@ -50,4 +50,4 @@ I18n.enforce_available_locales = true
 
 Mail.defaults do
   delivery_method :test
-end
\ No newline at end of file
+end
diff --git a/tests/stat_tests.rb b/tests/stat_tests.rb
new file mode 100644
index 00000000..da579e3e
--- /dev/null
+++ b/tests/stat_tests.rb
@@ -0,0 +1,77 @@
+require_relative './environment.rb'
+
+STAT_LOGS_PATH = 'tests/stat_logs'
+STAT_LOGS_DIR_MATCH = "#{STAT_LOGS_PATH}/*.log"
+
+def log(&block)
+  File.open("tests/stat_logs/#{SecureRandom.uuid}.log", 'w') do |f|
+    yield f
+  end
+end
+
+def random_time
+  (Time.now - rand(5000)).iso8601
+end
+
+describe 'stats' do
+  before do
+    Dir[STAT_LOGS_DIR_MATCH].each {|f| FileUtils.rm f}
+    @site_one = Fabricate :site
+    @site_two = Fabricate :site
+
+    @t = Time.now.iso8601
+    @s1u = @site_one.username
+    @s2u = @site_two.username
+  end
+
+  it 'parses multiple sets of logs' do
+    geoip = GeoIP.new Stat::GEOCITY_PATH
+
+    paths = ["/", "/#{SecureRandom.hex}", "/#{SecureRandom.hex}"]
+    cities = [geoip.city('67.180.75.140'), geoip.city('172.56.16.152')]
+    referrers = ['-', "http://#{@site_one.host}", "https://#{@site_one.host}", "http://insaneclownpossee.com"]
+    sites = [@site_one, @site_two]
+
+    test_hits = []
+
+    100.times { |i|
+      test_hits.push({
+        time: random_time,
+        username: sites[rand(sites.length)].username,
+        size: rand(5000),
+        path: paths[rand(paths.length)],
+        ip: i.odd? ? cities.first.ip : cities.last.ip,
+        referrer: referrers[rand(referrers.length)]
+      })
+    }
+
+    log do |f|
+      test_hits.each {|h| f.puts "#{h[:time]} #{h[:username]} #{h[:size]} #{h[:path]} #{h[:ip]} #{h[:referrer]}"}
+    end
+
+    Stat.parse_logfiles STAT_LOGS_PATH
+
+    Dir["#{STAT_LOGS_PATH}/*.log"].length.must_equal 0
+
+    sites_total = 0
+    [@site_one, @site_two].each do |site|
+      site.reload
+      sites_total += site.hits
+      site.views.must_equal 2
+    end
+
+    sites_total.must_equal 100
+
+    stats = Stat.where(site_id: [@site_one.id, @site_two.id]).all
+    stats.length.must_equal 2
+
+    stats.collect {|stat| stat.hits}.inject{|sum,x| sum + x }.must_equal 100
+    stats.collect {|stat| stat.views}.inject{|sum,x| sum + x }.must_equal 4
+
+    sites.each do |site|
+      test_hits.select {|h| h[:username] == site.username}.length.must_equal(
+        stats.select {|s| s.site.username == site.username}.first.hits
+      )
+    end
+  end
+end