Refactored GenerateWordCloudJob

This commit is contained in:
tsoganov 2025-05-14 11:07:41 +03:00
parent ee3ec443b3
commit 90f21a3be6
4 changed files with 111 additions and 81 deletions

View file

@ -1,86 +1,116 @@
# Use Open3 to capture output in real-time
require 'open3'
require 'securerandom'
# Background job that generates a wordcloud image from domain names
# using an external Python script with progress tracking.
#
# Progress is published through Rails.cache under the key
# "wordcloud_progress:<user_id>" as a hash with :status, :progress and,
# on failure, :error.
class GenerateWordCloudJob < ApplicationJob
  # Runs the wordcloud generation for one user.
  #
  # domains_file_path - path of the domains file handed to the script
  # user_id           - used to build the progress cache key
  # config            - hash serialized to JSON for the script (default: {})
  #
  # Errors raised while preparing or running the script are logged and
  # recorded in the progress entry by handle_error instead of being re-raised.
  def perform(domains_file_path, user_id, config = {})
    @domains_file_path = domains_file_path
    @user_id = user_id
    @config = config
    @progress_key = "wordcloud_progress:#{user_id}"
    @wordcloud_dir = Rails.root.join('public', 'wordcloud')
    # Stays nil until setup_environment has written the file, so cleanup can
    # tell whether there is anything to delete.
    @config_file_path = nil

    initialize_progress

    begin
      # The script must be prepared and launched exactly once per job; the
      # helpers below own all of that work.
      setup_environment
      run_wordcloud_script
    rescue StandardError => e
      handle_error(e)
    ensure
      cleanup
    end
  end
end
private
# Seeds the progress entry stored under @progress_key with the initial
# state: status 'processing' and progress 0.
def initialize_progress
  initial_state = { status: 'processing', progress: 0 }
  Rails.cache.write(@progress_key, initial_state)
end
# Prepares everything the script run needs on disk:
# creates the output directory, writes @config as JSON to a fresh config
# file (path kept in @config_file_path) and makes sure the Python script
# (path kept in @script_path) has its executable bit set.
def setup_environment
  # mkdir_p is a no-op when the directory already exists.
  FileUtils.mkdir_p(@wordcloud_dir)

  # A timestamp alone repeats for jobs started within the same second, which
  # would make them share (and delete) one config file; the random suffix
  # keeps every run's file distinct.
  config_name = "wordcloud_config_#{Time.now.to_i}_#{SecureRandom.hex(8)}.json"
  @config_file_path = Rails.root.join(@wordcloud_dir, config_name)
  File.write(@config_file_path, @config.to_json)

  @script_path = Rails.root.join('lib', 'wordcloud', 'generate_wordcloud.py')
  FileUtils.chmod('+x', @script_path) unless File.executable?(@script_path)
end
# Launches the script with the interpreter named by PYTHON_EXECUTABLE
# (falling back to 'python3'), passing the domains file, the output
# directory and the config file as arguments. The child runs with
# PYTHONIOENCODING=utf-8 and PYTHONUNBUFFERED=1; its combined
# stdout/stderr stream and wait thread are handed to process_script_output.
def run_wordcloud_script
  interpreter = ENV.fetch('PYTHON_EXECUTABLE', 'python3')
  child_env = { 'PYTHONIOENCODING' => 'utf-8', 'PYTHONUNBUFFERED' => '1' }
  argv = [
    interpreter,
    @script_path.to_s,
    @domains_file_path,
    @wordcloud_dir.to_s,
    @config_file_path.to_s
  ]

  Open3.popen2e(child_env, *argv) do |child_in, merged_out, waiter|
    # Nothing is ever sent to the child, so its stdin is closed right away.
    child_in.close
    process_script_output(merged_out, waiter)
  end
end
# Consumes the script's output stream until it is exhausted: every line is
# passed to update_progress_from_output and logged (stripped) at info level
# with a "WordCloud: " prefix. Afterwards the value of the wait thread is
# handed to handle_exit_status.
#
# stdout_err - readable stream yielding the script's output lines
# wait_thr   - object whose #value is the finished process status
def process_script_output(stdout_err, wait_thr)
  stdout_err.each_line do |output_line|
    update_progress_from_output(output_line)
    Rails.logger.info("WordCloud: #{output_line.strip}")
  end

  final_status = wait_thr.value
  handle_exit_status(final_status)
end
# Translates one line of script output into a progress update.
#
# * "Processing batch <current>/<total>" maps onto the 0..80 range.
# * "Total estimated cost" sets progress to 80.
# * "Generating word cloud" sets progress to 90.
#
# Any other line is ignored.
#
# line - a single line of script output
def update_progress_from_output(line)
  case line
  when %r{Processing batch (\d+)/(\d+)}
    current = Regexp.last_match(1).to_i
    total = Regexp.last_match(2).to_i
    # A zero total would make the ratio NaN/Infinity and #round would raise
    # FloatDomainError, failing the job over a mere progress line.
    return if total.zero?

    scaled = ((current.to_f / total) * 80).round
    # Batch progress never exceeds its 80-point share, even if the script
    # reports current > total.
    update_progress([scaled, 80].min)
  when /Total estimated cost/
    update_progress(80)
  when /Generating word cloud/
    update_progress(90)
  end
end
# Stores a progress snapshot in the cache under @progress_key.
#
# value  - written as :progress
# status - written as :status (default: 'processing')
def update_progress(value, status: 'processing')
  snapshot = { status: status, progress: value }
  Rails.cache.write(@progress_key, snapshot)
end
# Records the final outcome of the script run in the progress entry.
#
# A successful exit is stored as status 'completed' with progress 100.
# Anything else is stored as status 'failed' with progress 0 and an :error
# text describing how the process ended.
#
# exit_status - the finished process status (responds to success?,
#               exitstatus and termsig)
def handle_exit_status(exit_status)
  return update_progress(100, status: 'completed') if exit_status.success?

  # exitstatus is nil when the process did not exit normally (e.g. it was
  # killed by a signal); report the signal instead of an empty status.
  reason =
    if exit_status.exitstatus
      "Process failed with status #{exit_status.exitstatus}"
    else
      "Process terminated by signal #{exit_status.termsig}"
    end

  Rails.cache.write(@progress_key, { status: 'failed', progress: 0, error: reason })
end
# Reports an exception raised during the job: logs its message and its
# backtrace at error level, then marks the progress entry as 'failed' with
# progress 0 and the exception message as :error.
#
# exception - the rescued exception
def handle_error(exception)
  message = exception.message

  Rails.logger.error("Error in WordCloud job: #{message}")
  Rails.logger.error(exception.backtrace.join("\n"))

  failure = { status: 'failed', progress: 0, error: message }
  Rails.cache.write(@progress_key, failure)
end
# Deletes the config file referenced by @config_file_path, if a path was
# recorded and the file is still present.
def cleanup
  return unless @config_file_path
  return unless File.exist?(@config_file_path)

  File.delete(@config_file_path)
end
end

View file

@ -1 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","special_terms":["e-","i-","2-","3-","4-",".com","ai","web"]}
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@ -1,11 +1,11 @@
Top 10 most frequent words:
1. tr: 4
2. auto: 4
3. 2-: 4
4. faktor: 4
2. 2-: 4
3. faktor: 4
4. auto: 3
5. e-: 2
6. i-: 2
7. digi: 2
8. car: 2
7. car: 2
8. pood: 2
9. ai: 1
10. robot: 1

Binary file not shown.

Before

Width:  |  Height:  |  Size: 383 KiB

After

Width:  |  Height:  |  Size: 358 KiB

Before After
Before After