mirror of
https://github.com/internetee/registry.git
synced 2025-08-17 15:03:59 +02:00
Refactored GenerateWordCloudJob
This commit is contained in:
parent
ee3ec443b3
commit
90f21a3be6
4 changed files with 111 additions and 81 deletions
|
@ -1,86 +1,116 @@
|
||||||
# Use Open3 to capture output in real-time
|
# Use Open3 to capture output in real-time
|
||||||
require 'open3'
|
require 'open3'
|
||||||
|
|
||||||
|
# Background job that generates a wordcloud image from domain names
|
||||||
|
# using an external Python script with progress tracking
|
||||||
class GenerateWordCloudJob < ApplicationJob
|
class GenerateWordCloudJob < ApplicationJob
|
||||||
# Generates the wordcloud for one user and tracks progress in Rails.cache
# under "wordcloud_progress:<user_id>".
#
# Any StandardError raised while preparing or running the script is handed to
# handle_error instead of being re-raised; cleanup always runs afterwards.
#
# @param domains_file_path passed to the Python script as its domains input
# @param user_id interpolated into the progress cache key
# @param config [Hash] serialized to JSON for the Python script
def perform(domains_file_path, user_id, config = {})
  # Derived state first, then the raw arguments; the helpers read all of it
  # from instance variables.
  @progress_key = "wordcloud_progress:#{user_id}"
  @wordcloud_dir = Rails.root.join('public', 'wordcloud')
  @config_file_path = nil # set by setup_environment; nil means nothing to clean up
  @domains_file_path = domains_file_path
  @user_id = user_id
  @config = config

  # Written before the guarded section, so a failure here is not routed
  # through handle_error.
  initialize_progress

  begin
    setup_environment
    run_wordcloud_script
  rescue StandardError => e
    handle_error(e)
  ensure
    cleanup
  end
end
|
||||||
end
|
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
# Writes the initial progress entry (status 'processing', progress 0)
# under @progress_key.
def initialize_progress
  initial_state = { status: 'processing', progress: 0 }
  Rails.cache.write(@progress_key, initial_state)
end
|
||||||
|
|
||||||
|
# Prepares the on-disk inputs for the Python script.
#
# Creates the output directory, writes @config as JSON to a config file
# (its path is kept in @config_file_path so cleanup can remove it) and makes
# sure the generator script has its executable bit set.
def setup_environment
  # mkdir_p is already a no-op for an existing directory
  FileUtils.mkdir_p(@wordcloud_dir)

  # A bare timestamp has one-second resolution: two jobs started in the same
  # second would write to the same file and delete each other's config.
  # Process id + object id keep the name distinct per running job instance.
  config_name = "wordcloud_config_#{Time.now.to_i}_#{Process.pid}_#{object_id}.json"
  @config_file_path = Rails.root.join(@wordcloud_dir, config_name)
  File.write(@config_file_path, @config.to_json)

  # Setup Python script
  @script_path = Rails.root.join('lib', 'wordcloud', 'generate_wordcloud.py')
  FileUtils.chmod('+x', @script_path) unless File.executable?(@script_path)
end
|
||||||
|
|
||||||
|
# Runs the Python generator and streams its combined stdout/stderr to
# process_script_output.
#
# The interpreter comes from ENV['PYTHON_EXECUTABLE'] (default 'python3').
# Arguments are passed as separate argv entries, so no shell is involved.
def run_wordcloud_script
  python_executable = ENV.fetch('PYTHON_EXECUTABLE', 'python3')
  # Force UTF-8 and unbuffered output in the child process
  env = { 'PYTHONIOENCODING' => 'utf-8', 'PYTHONUNBUFFERED' => '1' }

  # Every argument is stringified: spawn only accepts Strings, and the domains
  # path was previously the one argument passed through as-is (a Pathname
  # would raise TypeError).
  command = [python_executable, @script_path, @domains_file_path,
             @wordcloud_dir, @config_file_path].map(&:to_s)

  Open3.popen2e(env, *command) do |stdin, stdout_err, wait_thr|
    stdin.close # the script takes no input
    process_script_output(stdout_err, wait_thr)
  end
end
|
||||||
|
|
||||||
|
# Consumes the script's combined output until EOF, then records the outcome.
#
# Each line is first checked for progress markers and then logged at info
# level; once the stream is exhausted the process exit status is handled.
def process_script_output(stdout_err, wait_thr)
  stdout_err.each_line do |output_line|
    update_progress_from_output(output_line)
    Rails.logger.info("WordCloud: #{output_line.strip}")
  end

  # wait_thr.value blocks until the child has exited
  final_status = wait_thr.value
  handle_exit_status(final_status)
end
|
||||||
|
|
||||||
|
# Translates one line of script output into a progress update.
#
# - "Processing batch X/Y" maps X/Y onto the 0..80 range
# - "Total estimated cost" sets progress to 80
# - "Generating word cloud" sets progress to 90
# Any other line is ignored.
def update_progress_from_output(line)
  case line
  when %r{Processing batch (\d+)/(\d+)}
    current = Regexp.last_match(1).to_i
    total = Regexp.last_match(2).to_i
    # A zero total would make the ratio Infinity/NaN and Float#round would
    # raise FloatDomainError, failing the whole job over a log line.
    return unless total.positive?

    # Cap at 80 so an overshooting batch counter cannot exceed this phase
    update_progress(((current.to_f / total) * 80).round.clamp(0, 80))
  when /Total estimated cost/
    update_progress(80)
  when /Generating word cloud/
    update_progress(90)
  end
end
|
||||||
|
|
||||||
|
# Stores the current progress under @progress_key.
#
# @param value progress value written as :progress
# @param status written as :status (defaults to 'processing')
def update_progress(value, status: 'processing')
  payload = { status: status, progress: value }
  Rails.cache.write(@progress_key, payload)
end
|
||||||
|
|
||||||
|
# Records the final outcome of the script run in the progress entry.
#
# A successful exit is stored as 'completed' at 100; anything else is stored
# as 'failed' at 0 with an error message naming the exit status or, for a
# process terminated by a signal, the signal number.
#
# @param exit_status [Process::Status]
def handle_exit_status(exit_status)
  return update_progress(100, status: 'completed') if exit_status.success?

  # exitstatus is nil for a signal-terminated process, which used to yield
  # the truncated message "Process failed with status ".
  reason = if exit_status.signaled?
             "signal #{exit_status.termsig}"
           else
             "status #{exit_status.exitstatus}"
           end

  Rails.cache.write(
    @progress_key,
    { status: 'failed', progress: 0, error: "Process failed with #{reason}" }
  )
end
|
||||||
|
|
||||||
|
# Logs an exception and marks the progress entry as failed.
#
# The message and, when present, the backtrace are logged at error level; the
# progress entry becomes status 'failed', progress 0, with the exception
# message as :error.
#
# @param exception [Exception]
def handle_error(exception)
  Rails.logger.error("Error in WordCloud job: #{exception.message}")

  # backtrace is nil for an exception that was never raised; calling join on
  # it would raise here and the 'failed' entry below would never be written.
  backtrace = exception.backtrace
  Rails.logger.error(backtrace.join("\n")) if backtrace

  Rails.cache.write(
    @progress_key,
    { status: 'failed', progress: 0, error: exception.message }
  )
end
|
||||||
|
|
||||||
|
# Removes the temporary config file, if one was created.
#
# Called from an ensure clause, so it must not raise: rm_f ignores a file
# that is already gone (no exist?/delete race) and other removal errors,
# which keeps a cleanup problem from replacing the job's real outcome.
def cleanup
  return unless @config_file_path

  FileUtils.rm_f(@config_file_path)
end
|
||||||
|
end
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","special_terms":["e-","i-","2-","3-","4-",".com","ai","web"]}
|
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}
|
|
@ -1,11 +1,11 @@
|
||||||
Top 10 most frequent words:
|
Top 10 most frequent words:
|
||||||
1. tr: 4
|
1. tr: 4
|
||||||
2. auto: 4
|
2. 2-: 4
|
||||||
3. 2-: 4
|
3. faktor: 4
|
||||||
4. faktor: 4
|
4. auto: 3
|
||||||
5. e-: 2
|
5. e-: 2
|
||||||
6. i-: 2
|
6. i-: 2
|
||||||
7. digi: 2
|
7. car: 2
|
||||||
8. car: 2
|
8. pood: 2
|
||||||
9. ai: 1
|
9. ai: 1
|
||||||
10. robot: 1
|
10. robot: 1
|
||||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 383 KiB After Width: | Height: | Size: 358 KiB |
Loading…
Add table
Add a link
Reference in a new issue