# frozen_string_literal: true

require 'fileutils'
require 'json'
# Open3 is used to capture the script's output in real time.
require 'open3'
require 'securerandom'

# Background job that generates a wordcloud image from domain names
# using an external Python script with progress tracking.
#
# Progress is published through Rails.cache under the key
# "wordcloud_progress:<user_id>" as a hash with :status ('processing',
# 'completed' or 'failed'), :progress (0..100) and, on failure, :error.
class GenerateWordCloudJob < ApplicationJob
  # Upper bound of the progress range covered by the batch-processing phase.
  BATCH_PHASE_CEILING = 80
  # Progress reported once the script announces it is rendering the cloud.
  RENDER_PHASE_PROGRESS = 90
  # Script output line announcing the batch currently being processed.
  BATCH_LINE = %r{Processing batch (?<current>\d+)/(?<total>\d+)}

  # Runs the Python wordcloud script and mirrors its progress into the cache.
  #
  # Any StandardError raised while preparing or running the script is logged
  # and recorded as a 'failed' progress entry; it is not re-raised.
  #
  # @param domains_file_path [String, #to_s] passed to the script as its first argument
  # @param user_id [Object] interpolated into the progress cache key
  # @param config [Hash] serialized to a JSON file whose path is passed to the script
  # @return [void]
  def perform(domains_file_path, user_id, config = {})
    @domains_file_path = domains_file_path
    @user_id = user_id
    @config = config
    @progress_key = "wordcloud_progress:#{user_id}"
    @wordcloud_dir = Rails.root.join('public', 'wordcloud')
    @config_file_path = nil

    initialize_progress

    begin
      setup_environment
      run_wordcloud_script
    rescue StandardError => e
      handle_error(e)
    ensure
      cleanup
    end
  end

  private

  # Publishes the initial 'processing' / 0% entry.
  def initialize_progress
    update_progress(0)
  end

  # Creates the wordcloud directory, writes the per-run config file and
  # resolves the script path.
  def setup_environment
    # mkdir_p is a no-op when the directory already exists.
    FileUtils.mkdir_p(@wordcloud_dir)

    # The random suffix keeps jobs started within the same second from sharing
    # (and then deleting) each other's config file.
    config_name = "wordcloud_config_#{Time.now.to_i}_#{SecureRandom.hex(8)}.json"
    @config_file_path = @wordcloud_dir.join(config_name)
    File.write(@config_file_path, @config.to_json)

    @script_path = Rails.root.join('lib', 'wordcloud', 'generate_wordcloud.py')
    FileUtils.chmod('+x', @script_path) unless File.executable?(@script_path)
  end

  # Spawns the Python script with stdout and stderr merged into one stream.
  #
  # The script receives, in order: the domains file, the wordcloud directory
  # and the config file path.
  def run_wordcloud_script
    python_executable = ENV.fetch('PYTHON_EXECUTABLE', 'python3')
    env = { 'PYTHONIOENCODING' => 'utf-8', 'PYTHONUNBUFFERED' => '1' }
    command = [
      python_executable,
      @script_path.to_s,
      @domains_file_path.to_s, # to_s so a Pathname is accepted as well as a String
      @wordcloud_dir.to_s,
      @config_file_path.to_s
    ]

    Open3.popen2e(env, *command) do |stdin, stdout_err, wait_thr|
      # The script is not fed any input.
      stdin.close
      process_script_output(stdout_err, wait_thr)
    end
  end

  # Streams the script output line by line, then records the exit status.
  #
  # @param stdout_err [IO] merged stdout/stderr of the script
  # @param wait_thr [Thread] thread whose value is the script's Process::Status
  def process_script_output(stdout_err, wait_thr)
    stdout_err.each_line do |raw_line|
      # Matching a regex against a string with invalid bytes raises
      # ArgumentError, so replace them before inspecting the line.
      line = raw_line.scrub
      update_progress_from_output(line)
      Rails.logger.info("WordCloud: #{line.strip}")
    end

    handle_exit_status(wait_thr.value)
  end

  # Translates a recognized output line into a progress update; other lines
  # are ignored.
  def update_progress_from_output(line)
    case line
    when BATCH_LINE
      batch = Regexp.last_match
      progress = batch_progress(batch[:current].to_i, batch[:total].to_i)
      update_progress(progress) if progress
    when /Total estimated cost/
      update_progress(BATCH_PHASE_CEILING)
    when /Generating word cloud/
      update_progress(RENDER_PHASE_PROGRESS)
    end
  end

  # Maps "batch current of total" onto 0..BATCH_PHASE_CEILING.
  #
  # @return [Integer, nil] nil when total is zero, because the ratio is then
  #   NaN or Infinity and Float#round would raise FloatDomainError
  def batch_progress(current, total)
    return nil unless total.positive?

    ((current.to_f / total) * BATCH_PHASE_CEILING).round.clamp(0, BATCH_PHASE_CEILING)
  end

  # Writes a progress entry without an error message.
  def update_progress(value, status: 'processing')
    Rails.cache.write(@progress_key, { status: status, progress: value })
  end

  # Writes a 'failed' entry carrying the given error message.
  def report_failure(message)
    Rails.cache.write(@progress_key, { status: 'failed', progress: 0, error: message })
  end

  # Records completion, or a failure describing how the process ended.
  #
  # @param exit_status [Process::Status]
  def handle_exit_status(exit_status)
    return update_progress(100, status: 'completed') if exit_status.success?

    report_failure(failure_reason(exit_status))
  end

  # exitstatus is nil for a process killed by a signal, so report the signal
  # number in that case instead of an empty status.
  def failure_reason(exit_status)
    if exit_status.signaled?
      "Process terminated by signal #{exit_status.termsig}"
    else
      "Process failed with status #{exit_status.exitstatus}"
    end
  end

  # Logs the exception with its backtrace and records the failure.
  def handle_error(exception)
    Rails.logger.error("Error in WordCloud job: #{exception.message}")
    Rails.logger.error(Array(exception.backtrace).join("\n"))
    report_failure(exception.message)
  end

  # Removes the per-run config file. rm_f ignores a missing file and other
  # errors, so this ensure-step cannot replace the job's outcome with its own
  # exception.
  def cleanup
    FileUtils.rm_f(@config_file_path) if @config_file_path
  end
end