Added wordcloud tool

This commit is contained in:
tsoganov 2025-05-14 10:56:58 +03:00
parent 0a06a36499
commit ee3ec443b3
20 changed files with 932 additions and 1 deletions

View file

@ -113,3 +113,6 @@ gem 'net-ftp'
# https://stackoverflow.com/questions/79360526/uninitialized-constant-activesupportloggerthreadsafelevellogger-nameerror
gem 'concurrent-ruby', '1.3.4'
# gives you access to stdin, stdout, and stderr when running other programs
gem 'open3'

View file

@ -394,6 +394,7 @@ GEM
omniauth-rails_csrf_protection (0.1.2)
actionpack (>= 4.2)
omniauth (>= 1.3.1)
open3 (0.2.1)
openid_connect (1.4.2)
activemodel
attr_required (>= 1.0.0)
@ -636,6 +637,7 @@ DEPENDENCIES
nokogiri (~> 1.16.0)
omniauth-rails_csrf_protection
omniauth-tara!
open3
openssl
paper_trail (~> 14.0)
pdfkit

View file

@ -0,0 +1,174 @@
# frozen_string_literal: true
module Admin
module Tools
# Controller for the admin wordcloud generator tool that creates visual representations
# of the most common words used in domain names
class WordcloudController < BaseController # rubocop:disable Metrics/ClassLength
WORDCLOUD_DIR = Rails.root.join('public', 'wordcloud')
WORDCLOUD_IMAGE_PATH = WORDCLOUD_DIR.join('wordcloud.png')
WORDCLOUD_CONFIG_PATH = WORDCLOUD_DIR.join('config.json')
TOP_WORDS_PATH = WORDCLOUD_DIR.join('top_words.txt')
before_action :authorize_admin
before_action :clear_cache, only: :create
before_action :ensure_wordcloud_dir, only: :create
# GET /admin/tools/wordcloud
#
# Prepares the generator page: the form is pre-filled from the saved (or
# default) configuration, and the image data is only set up when a wordcloud
# image has already been generated.
def index
  @config = load_wordcloud_config

  # Stays nil unless an image exists; the view uses this to hide the preview
  @wordcloud_url = nil
  setup_wordcloud_data if File.exist?(WORDCLOUD_IMAGE_PATH)
end
# POST /admin/tools/wordcloud
#
# Stores the uploaded domain list, saves the submitted options as the current
# configuration and enqueues GenerateWordCloudJob. Redirects to the progress
# page on success; redirects back to the form with a flash alert when no file
# was sent, the file was rejected, or anything raises.
def create
  unless params[:domains_file].present?
    flash[:alert] = I18n.t('admin.tools.wordcloud_no_file')
    return redirect_to admin_tools_wordcloud_path
  end

  domains_file_path = process_uploaded_file(params[:domains_file])
  # A nil path means the upload was rejected and the flash message is already set
  return redirect_to admin_tools_wordcloud_path if domains_file_path.nil?

  # Persist the options so the form can be pre-filled on the next visit
  config = build_config_from_params
  File.write(WORDCLOUD_CONFIG_PATH, config.to_json)

  GenerateWordCloudJob.perform_later(domains_file_path.to_s, current_admin_user.id, config)
  redirect_to progress_admin_tools_wordcloud_path
rescue StandardError => e
  logger.error "Error starting wordcloud generation: #{e.message}"
  flash[:alert] = "#{I18n.t('admin.tools.wordcloud_error')}: #{e.message}"
  redirect_to admin_tools_wordcloud_path
end
# GET /admin/tools/wordcloud/progress
#
# Exposes the current admin's cached generation progress to the progress page,
# falling back to a "not started" placeholder when nothing is cached.
def progress
  @progress_key = "wordcloud_progress:#{current_admin_user.id}"
  cached_progress = Rails.cache.fetch(@progress_key)
  @progress_data = cached_progress || { status: 'not_started', progress: 0 }
end
# GET /admin/tools/wordcloud/status
#
# JSON variant of the progress lookup: renders the current admin's cached
# progress hash, or a "not started" placeholder when nothing is cached.
def status
  cache_key = "wordcloud_progress:#{current_admin_user.id}"
  payload = Rails.cache.fetch(cache_key) || { status: 'not_started', progress: 0 }
  render json: payload
end
private
# Creates the wordcloud output directory (including parents) when it is missing.
def ensure_wordcloud_dir
  return if Dir.exist?(WORDCLOUD_DIR)

  FileUtils.mkdir_p(WORDCLOUD_DIR)
end
# Copies the upload out of its temp file into tmp/ under Rails.root so that a
# copy exists at a stable path that can be handed to the background job.
#
# @param uploaded_file [#tempfile] upload object whose tempfile holds the data
# @return [Pathname, nil] path of the copy, or nil when the upload is empty
#   (the copy is removed and a flash alert is set in that case)
def process_uploaded_file(uploaded_file)
  target_path = Rails.root.join('tmp', "domains_#{Time.now.to_i}.csv")
  FileUtils.cp(uploaded_file.tempfile.path, target_path)

  return target_path unless File.size(target_path).zero?

  # Empty upload: discard the copy and tell the user why nothing happened
  File.delete(target_path)
  flash[:alert] = I18n.t('admin.tools.wordcloud_empty_file')
  nil
end
# Builds the generator configuration from the submitted form parameters.
#
# Numeric options are parsed as base-10 integers, replaced by their default
# when blank or malformed, and clamped to the ranges the form advertises. This
# enforces the browser-side min/max on the server as well and keeps the saved
# configuration in integers, like default_wordcloud_config.
#
# @return [Hash] symbol-keyed options; :additional_stopwords and :special_terms
#   are only present when at least one term was submitted
def build_config_from_params
  config = {
    width: integer_param(:width, 800, 400..2000),
    height: integer_param(:height, 800, 400..2000),
    max_words: integer_param(:max_words, 500, 100..1000),
    background_color: params[:background_color].presence || 'white',
    min_word_length: integer_param(:min_word_length, 2, 1..5),
    include_numbers: params[:include_numbers] == '1',
    batch_size: integer_param(:batch_size, 500, 100..1000)
  }

  # Stopwords are stored lower-cased; special terms keep their original case
  stopwords = term_list_param(:additional_stopwords).map(&:downcase)
  config[:additional_stopwords] = stopwords if stopwords.any?

  special_terms = term_list_param(:special_terms)
  config[:special_terms] = special_terms if special_terms.any?

  config
end

# Reads params[name] as a base-10 integer limited to +range+.
# Returns +default+ when the parameter is missing, blank or not an integer.
def integer_param(name, default, range)
  parsed = Integer(params[name].to_s, 10, exception: false)
  return default if parsed.nil?

  parsed.clamp(range.begin, range.end)
end

# Splits params[name] on whitespace and commas, dropping empty entries.
# A missing parameter yields an empty list.
def term_list_param(name)
  params[name].to_s.split(/[\s,]+/).reject(&:empty?)
end
# Returns the last saved generator configuration as parsed JSON.
# Falls back to default_wordcloud_config when the config file is missing or
# does not contain valid JSON.
def load_wordcloud_config
  return default_wordcloud_config unless File.exist?(WORDCLOUD_CONFIG_PATH)

  JSON.parse(File.read(WORDCLOUD_CONFIG_PATH))
rescue JSON::ParserError
  default_wordcloud_config
end
# Prepares everything the view needs to show an existing wordcloud image:
# the image URL, the generation time and the top-words list.
def setup_wordcloud_data
  # Stat the image once so the cache-busting token and the displayed time
  # always describe the same file version, even if it is rewritten meanwhile
  generated_at = File.mtime(WORDCLOUD_IMAGE_PATH)

  # The timestamp query string defeats browser caching of a regenerated image
  @wordcloud_url = "/wordcloud/wordcloud.png?t=#{generated_at.to_i}"
  # Shown to the user in the application time zone
  @wordcloud_generated_at = generated_at.in_time_zone(Time.zone)

  load_top_words
end
# Parses the generator's top-words file into @top_words as [word, count] pairs.
#
# Expected line format: "<rank>. <word>: <count>". Lines that do not match
# (such as the heading line) are skipped. The word is matched as any run of
# non-whitespace so that special terms like "e-" or "2-" and non-ASCII words
# are kept; an ASCII-only \w+ would silently drop them.
#
# Leaves @top_words untouched when the file does not exist.
def load_top_words
  return unless File.exist?(TOP_WORDS_PATH)

  entry_pattern = /\A\d+\.\s+(\S+):\s+(\d+)\s*\z/

  @top_words = File.readlines(TOP_WORDS_PATH, encoding: 'UTF-8').map do |line|
    entry = entry_pattern.match(line)
    [entry[1], entry[2].to_i] if entry
  end.compact
end
# Configuration used when no saved config file can be read.
# Keys are strings so the hash has the same shape as the JSON-parsed config.
def default_wordcloud_config
  canvas = { 'width' => 800, 'height' => 800, 'max_words' => 500, 'background_color' => 'white' }
  word_filters = {
    'additional_stopwords' => [],
    'include_numbers' => true,
    'min_word_length' => 2,
    'special_terms' => ['e-', 'i-', '2-', '3-', '4-', '.com', 'tr.ee', 'ai', 'web']
  }

  canvas.merge(word_filters).merge('batch_size' => 500)
end
# before_action guard for every action of this controller: delegates to
# authorize! with the :access action on the :tools subject (the Ability class
# declares a matching `can :access, :tools` rule).
def authorize_admin
authorize! :access, :tools
end
# Drops the current admin's progress entry so a new run starts from a clean
# "not started" state instead of showing the previous run's result.
def clear_cache
  progress_key = "wordcloud_progress:#{current_admin_user.id}"
  Rails.cache.delete(progress_key)
end
end
end
end

View file

@ -0,0 +1,14 @@
module Admin
  # Landing page of the admin "Tools" section.
  class ToolsController < BaseController
    before_action :authorize_admin

    # GET /admin/tools
    def index
      # Nothing to load: the template only lists the available tools
    end

    private

    # Guard for all actions: checks the :access permission on :tools.
    def authorize_admin
      authorize!(:access, :tools)
    end
  end
end

View file

@ -0,0 +1,86 @@
# Use Open3 to capture output in real-time
require 'open3'
# Background job that runs the Python wordcloud generator for a domain list and
# publishes its progress in Rails.cache under "wordcloud_progress:<user_id>".
#
# Progress entries are hashes with :status ('processing', 'completed' or
# 'failed'), :progress (0-100) and, for failures, :error.
class GenerateWordCloudJob < ApplicationJob
  # Share of the progress bar covered by the batch-processing phase
  BATCH_PHASE_PERCENT = 80

  # @param domains_file_path [String] file handed to the script as its first argument
  # @param user_id [Integer, String] identifies whose progress entry is updated
  # @param config [Hash] generator options, passed to the script as a JSON file
  #
  # NOTE(review): domains_file_path is not removed here; if callers create it
  # only for this job, the copies accumulate — confirm who owns cleanup.
  def perform(domains_file_path, user_id, config = {})
    progress_key = "wordcloud_progress:#{user_id}"
    # Initialised up front so the ensure clause can tell whether the temporary
    # config file was ever created
    config_file_path = nil
    Rails.cache.write(progress_key, { status: 'processing', progress: 0 })

    begin
      wordcloud_dir = Rails.root.join('public', 'wordcloud')
      FileUtils.mkdir_p(wordcloud_dir)

      python_executable = ENV.fetch('PYTHON_EXECUTABLE', 'python3')
      script_path = Rails.root.join('lib', 'wordcloud', 'generate_wordcloud.py')

      # The script receives its options through a JSON file
      config_file_path = wordcloud_dir.join("wordcloud_config_#{Time.now.to_i}.json")
      File.write(config_file_path, config.to_json)

      # UTF-8 output, unbuffered so progress lines arrive as they are printed
      env = { 'PYTHONIOENCODING' => 'utf-8', 'PYTHONUNBUFFERED' => '1' }

      FileUtils.chmod('+x', script_path) unless File.executable?(script_path)

      command = [python_executable, script_path.to_s, domains_file_path,
                 wordcloud_dir.to_s, config_file_path.to_s]

      Open3.popen2e(env, *command) do |stdin, stdout_err, wait_thr|
        # The script reads nothing from stdin
        stdin.close

        stdout_err.each_line do |line|
          progress = progress_for(line)
          Rails.cache.write(progress_key, { status: 'processing', progress: progress }) if progress

          Rails.logger.info("WordCloud: #{line.strip}")
        end

        exit_status = wait_thr.value
        if exit_status.success?
          Rails.cache.write(progress_key, { status: 'completed', progress: 100 })
        else
          Rails.cache.write(progress_key, {
                              status: 'failed',
                              progress: 0,
                              error: "Process failed with status #{exit_status.exitstatus}"
                            })
        end
      end
    rescue StandardError => e
      Rails.logger.error("Error in WordCloud job: #{e.message}")
      Rails.logger.error(e.backtrace.join("\n"))
      Rails.cache.write(progress_key, { status: 'failed', progress: 0, error: e.message })
    ensure
      # Guard against nil: an early failure means the file was never created
      File.delete(config_file_path) if config_file_path && File.exist?(config_file_path)
    end
  end

  private

  # Maps one line of script output to a progress percentage.
  # Returns nil for lines that carry no progress information.
  def progress_for(line)
    if (batch = line.match(%r{Processing batch (\d+)/(\d+)}))
      current = batch[1].to_i
      total = batch[2].to_i
      # A zero total would make the ratio NaN/Infinity and #round would raise
      return 0 if total.zero?

      ((current.to_f / total) * BATCH_PHASE_PERCENT).round
    elsif line.match?(/Total estimated cost/)
      # Printed once word extraction has finished
      BATCH_PHASE_PERCENT
    elsif line.match?(/Generating word cloud/)
      # Printed when rendering of the image starts
      90
    end
  end
end

View file

@ -121,6 +121,7 @@ class Ability
can :destroy, :pending
can :create, :zonefile
can :access, :settings_menu
can :access, :tools
can :manage, :mass_actions
can :manage, BouncedMailAddress
end

View file

@ -6,6 +6,8 @@
%li= link_to t(:contacts), admin_contacts_path
- if can? :show, Registrar
%li= link_to t(:registrars), admin_registrars_path
- if can?(:access, :tools)
%li= link_to t(:tools), admin_tools_path
- if can?(:access, :settings_menu)
%li.dropdown
%a.dropdown-toggle{"data-toggle" => "dropdown", href: "#"}

View file

@ -0,0 +1,23 @@
<%= render "shared/title", name: t('admin.tools.title') %>
<div class="row">
<div class="col-md-12">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><%= t('admin.tools.available_tools') %></h3>
</div>
<div class="panel-body">
<div class="row">
<div class="col-md-4">
<div class="well well-sm">
<h4><%= t('admin.tools.wordcloud_generator') %></h4>
<p><%= t('admin.tools.wordcloud_generator_description') %></p>
<%= link_to t('admin.tools.generate_wordcloud'), admin_tools_wordcloud_path, class: 'btn btn-primary' %>
</div>
</div>
<!-- Additional tools can be added here in similar well blocks -->
</div>
</div>
</div>
</div>
</div>

View file

@ -0,0 +1,198 @@
<% content_for :actions do %>
<%= link_to t('back'), admin_tools_path, class: 'btn btn-default' %>
<% end %>
<%= render "shared/title", name: t('admin.tools.wordcloud_title') %>
<style>
.wordcloud-container {
margin-bottom: 20px;
}
.controls-section {
padding-top: 15px;
margin-top: 15px;
border-top: 1px solid #eee;
}
.instructions {
margin-bottom: 15px;
color: #555;
}
.mt-2 {
margin-top: 10px;
}
.wordcloud-container a {
display: block;
text-decoration: none;
padding: 5px;
border: 1px solid transparent;
transition: all 0.2s ease;
}
.wordcloud-container a:hover {
border-color: #ddd;
background-color: #f9f9f9;
border-radius: 4px;
}
.wordcloud-container a small {
color: #337ab7;
}
</style>
<div class="row">
<div class="col-md-12">
<div class="row">
<div class="col-md-8">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><%= t('admin.tools.wordcloud_title') %></h3>
</div>
<div class="panel-body text-center">
<% if @wordcloud_url %>
<div class="wordcloud-container">
<%= link_to @wordcloud_url, target: "_blank", title: t('admin.tools.view_full_size') do %>
<%= image_tag @wordcloud_url, class: 'img-responsive', alt: t('admin.tools.wordcloud_title') %>
<div class="text-center mt-2">
<small><i class="fa fa-search-plus"></i> <%= t('admin.tools.click_to_enlarge') %></small>
</div>
<% end %>
<% if @wordcloud_generated_at %>
<div class="text-muted mt-2">
<small><i class="fa fa-clock-o"></i> <%= t('admin.tools.generated_at', time: l(@wordcloud_generated_at, format: :long)) %></small>
</div>
<% end %>
</div>
<% end %>
<div class="instructions">
<p><%= t('admin.tools.wordcloud_instructions') %></p>
</div>
<%= form_tag admin_tools_wordcloud_path, method: :post, multipart: true do %>
<div class="row">
<div class="col-md-12">
<div class="form-group">
<div class="custom-file-upload">
<p class="text-muted"><%= t('admin.tools.custom_file_description') %></p>
<%= file_field_tag :domains_file, accept: '.csv', class: 'form-control' %>
</div>
</div>
</div>
</div>
<div class="panel panel-default mt-3">
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" href="#advancedOptions">
<i class="fa fa-cog"></i> <%= t('admin.tools.advanced_options') %>
</a>
</h4>
</div>
<div id="advancedOptions" class="panel-collapse collapse">
<div class="panel-body">
<div class="row">
<div class="col-md-4">
<div class="form-group">
<%= label_tag :width, t('admin.tools.wordcloud_width') %>
<%= number_field_tag :width, @config['width'], min: 400, max: 2000, step: 100, class: 'form-control' %>
</div>
</div>
<div class="col-md-4">
<div class="form-group">
<%= label_tag :height, t('admin.tools.wordcloud_height') %>
<%= number_field_tag :height, @config['height'], min: 400, max: 2000, step: 100, class: 'form-control' %>
</div>
</div>
<div class="col-md-4">
<div class="form-group">
<%= label_tag :max_words, t('admin.tools.wordcloud_max_words') %>
<%= number_field_tag :max_words, @config['max_words'], min: 100, max: 1000, step: 50, class: 'form-control' %>
</div>
</div>
</div>
<div class="row">
<div class="col-md-6">
<div class="form-group">
<%= label_tag :batch_size, t('admin.tools.batch_size') %>
<%= number_field_tag :batch_size, @config['batch_size'], min: 100, max: 1000, step: 50, class: 'form-control' %>
<small class="text-muted"><%= t('admin.tools.batch_size_help') %></small>
</div>
</div>
<div class="col-md-6">
<div class="form-group">
<%= label_tag :background_color, t('admin.tools.wordcloud_background') %>
<%= select_tag :background_color,
options_for_select([
['White', 'white'],
['Black', 'black'],
['Transparent', 'transparent'],
['Light Gray', '#f0f0f0']
], @config['background_color']),
class: 'form-control' %>
</div>
</div>
</div>
<div class="row">
<div class="col-md-6">
<div class="form-group">
<%= label_tag :min_word_length, t('admin.tools.min_word_length') %>
<%= number_field_tag :min_word_length, @config['min_word_length'], min: 1, max: 5, class: 'form-control' %>
</div>
</div>
<div class="col-md-6">
<div class="form-group">
<div class="checkbox" style="margin-top: 30px;">
<label>
<%= check_box_tag :include_numbers, '1', @config['include_numbers'] %>
<%= t('admin.tools.include_numbers') %>
</label>
</div>
</div>
</div>
</div>
<div class="form-group">
<%= label_tag :special_terms, t('admin.tools.special_terms') %>
<%= text_field_tag :special_terms, @config['special_terms'].is_a?(Array) ? @config['special_terms'].join(', ') : '',
class: 'form-control',
placeholder: t('admin.tools.special_terms_placeholder') %>
<small class="text-muted"><%= t('admin.tools.special_terms_help') %></small>
</div>
<div class="form-group">
<%= label_tag :additional_stopwords, t('admin.tools.additional_stopwords') %>
<%= text_area_tag :additional_stopwords, @config['additional_stopwords'].is_a?(Array) ? @config['additional_stopwords'].join(', ') : '',
rows: 3,
placeholder: t('admin.tools.stopwords_placeholder'),
class: 'form-control' %>
<small class="text-muted"><%= t('admin.tools.stopwords_help') %></small>
</div>
</div>
</div>
</div>
<%= submit_tag t('admin.tools.generate_wordcloud'), class: 'btn btn-primary btn-lg mt-3' %>
<% end %>
</div>
</div>
</div>
<div class="col-md-4">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><%= t('admin.tools.top_words') %></h3>
</div>
<div class="panel-body">
<% if @top_words && @top_words.any? %>
<ol>
<% @top_words.each do |word, count| %>
<li><strong><%= word %></strong>: <%= count %></li>
<% end %>
</ol>
<% else %>
<p class="text-muted"><%= t('admin.tools.top_words_empty') %></p>
<% end %>
</div>
</div>
</div>
</div>
</div>
</div>

View file

@ -0,0 +1,93 @@
<div class="container-fluid">
<div class="row">
<div class="col-12">
<h1>WordCloud Generation Progress</h1>
<div class="card mb-4">
<div class="card-body">
<div class="progress mb-3">
<div id="progress-bar" class="progress-bar" role="progressbar" style="width: <%= @progress_data[:progress] %>%;"
aria-valuenow="<%= @progress_data[:progress] %>" aria-valuemin="0" aria-valuemax="100">
<%= @progress_data[:progress] %>%
</div>
</div>
<div id="status-message">
<% case @progress_data[:status] %>
<% when 'not_started' %>
<div class="alert alert-info">Waiting to start processing...</div>
<% when 'processing' %>
<div class="alert alert-info">Processing in progress...</div>
<% when 'completed' %>
<div class="alert alert-success">
WordCloud generation completed!
<%= link_to "View WordCloud", admin_tools_wordcloud_path, class: "btn btn-primary" %>
</div>
<% when 'failed' %>
<div class="alert alert-danger">
Error: <%= @progress_data[:error] || "Unknown error occurred" %>
</div>
<% end %>
</div>
<div class="mt-3">
<%= link_to "Back to Tools", admin_tools_path, class: "btn btn-secondary" %>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
  // The page is rendered with the status known at load time; polling is only
  // useful while the job can still change state
  const initialStatus = "<%= @progress_data[:status] %>";
  if (initialStatus !== "completed" && initialStatus !== "failed") {
    pollProgress();
  }
});

/**
 * Polls the status endpoint every 2 seconds and mirrors the result in the
 * progress bar and the status area. Polling stops as soon as the job reports
 * "completed" (followed by a redirect to the wordcloud page) or "failed".
 */
function pollProgress() {
  const progressBar = document.getElementById('progress-bar');
  const statusMessage = document.getElementById('status-message');
  // Set once a terminal status was handled, so a response that was already in
  // flight cannot re-render the message or schedule a second redirect
  let finished = false;

  // Replaces the status area with a single alert box. The text is assigned via
  // textContent so server-provided messages are never parsed as HTML.
  function showAlert(level, text) {
    const alertBox = document.createElement('div');
    alertBox.className = 'alert alert-' + level;
    alertBox.textContent = text;
    statusMessage.innerHTML = '';
    statusMessage.appendChild(alertBox);
    return alertBox;
  }

  const timer = setInterval(function() {
    fetch('<%= status_admin_tools_wordcloud_path %>')
      .then(response => response.json())
      .then(data => {
        if (finished) {
          return;
        }

        // Update progress bar
        progressBar.style.width = data.progress + '%';
        progressBar.setAttribute('aria-valuenow', data.progress);
        progressBar.textContent = data.progress + '%';

        // Update status message
        switch (data.status) {
          case 'not_started':
            showAlert('info', 'Waiting to start processing...');
            break;
          case 'processing':
            showAlert('info', 'Processing in progress...');
            break;
          case 'completed': {
            finished = true;
            clearInterval(timer);
            const successBox = showAlert('success', 'WordCloud generation completed! ');
            const link = document.createElement('a');
            link.href = '<%= admin_tools_wordcloud_path %>';
            link.textContent = 'View WordCloud';
            successBox.appendChild(link);
            // Redirect after a short delay
            setTimeout(function() {
              window.location.href = '<%= admin_tools_wordcloud_path %>';
            }, 2000);
            break;
          }
          case 'failed':
            finished = true;
            clearInterval(timer);
            showAlert('danger', 'Error: ' + (data.error || "Unknown error occurred"));
            break;
          default:
            // Unknown status: show nothing rather than a stale message
            statusMessage.innerHTML = '';
        }
      })
      .catch(error => {
        console.error('Error polling progress:', error);
      });
  }, 2000);
}
</script>

View file

@ -54,6 +54,10 @@ Rails.application.configure do
# Use a different cache store in production.
# config.cache_store = :mem_cache_store
# Redis-backed cache on database 1 of REDIS_URL (local Redis by default).
# Entries expire after 300 seconds unless a write specifies otherwise.
config.cache_store = :redis_cache_store, {
  url: "#{ENV.fetch('REDIS_URL', 'redis://localhost:6379')}/1",
  expires_in: 300.seconds
}
# Use a real queuing backend for Active Job (and separate queues per environment)
config.active_job.queue_adapter = :sidekiq

View file

@ -2,7 +2,7 @@ require 'sidekiq/web' # Require at the top of the initializer
Sidekiq.configure_server do |config|
config.logger.level = Logger::INFO
# Custom job logging format
Sidekiq.logger.formatter = proc do |severity, datetime, progname, msg|
thread_id = Thread.current.object_id.to_s(36)

View file

@ -0,0 +1,40 @@
en:
admin:
tools:
title: "Administrative Tools"
available_tools: "Available Tools"
wordcloud_generator: "Domain Name Wordcloud Generator"
wordcloud_generator_description: "Generate a visual wordcloud from domain names in the registry"
generate_wordcloud: "Generate Wordcloud"
regenerate_wordcloud: "Regenerate Wordcloud"
regenerate_description: "Click below to regenerate the wordcloud with the latest domain data"
wordcloud_title: "Domain Name Wordcloud"
wordcloud_success: "Wordcloud generated successfully"
wordcloud_error: "Error generating wordcloud"
wordcloud_processing: "Processing domain names. This may take a few minutes..."
wordcloud_instructions: "Generate a visual representation of the most common words in domain names. Upload a CSV file with domain names and click the button below to create the wordcloud."
top_words: "Top Words"
top_words_empty: "Generate a wordcloud to see the most frequent words."
click_to_enlarge: "Click to enlarge"
view_full_size: "View full size wordcloud image"
use_custom_domains: "Use custom domain list"
custom_file_description: "Upload a CSV file with one domain name per line"
file_upload_error: "Error processing uploaded file"
file_optional: "If no file is uploaded, all active domains in the registry will be used"
generated_at: "Generated at %{time}"
wordcloud_no_file: "No file uploaded. Please choose a CSV file with domain names."
wordcloud_empty_file: "The uploaded file is empty"
wordcloud_width: "Width"
wordcloud_height: "Height"
wordcloud_max_words: "Max Words"
wordcloud_background: "Background"
additional_stopwords: "Stopwords"
stopwords_placeholder: "Enter additional stopwords, one per line"
stopwords_help: "Stopwords are words that will not be included in the wordcloud"
advanced_options: "Advanced Options"
min_word_length: "Min Word Length"
include_numbers: "Include Numbers"
special_terms: "Special Terms"
special_terms_placeholder: "e.g., e-, i-, .com, ai, web"
special_terms_help: "These terms will be preserved in the word cloud even if they would normally be filtered out"
batch_size: "Batch Size"
batch_size_help: "Number of domains to process in each API call."

View file

@ -250,6 +250,17 @@ Rails.application.routes.draw do
end
# post 'admi/upload_spreadsheet', to: 'customers#upload_spreadsheet', as: :customers_upload_spreadsheet
# Admin tools: landing page plus the wordcloud generator routes.
# Only the index action is exposed for the tools landing page.
resources :tools, only: %i[index]
namespace :tools do
# Singular resource handled by the 'wordcloud' controller: the resource itself
# only contributes `create`; the GET endpoints are declared by hand below.
resource :wordcloud, controller: 'wordcloud', only: %i[create] do
collection do
# Empty path: GET on the resource URL itself is served by wordcloud#index
get '', to: 'wordcloud#index'
# Progress page, the redirect target after a job has been enqueued
get 'progress', to: 'wordcloud#progress', as: :progress
# JSON progress endpoint polled by the progress page
get 'status', to: 'wordcloud#status', as: :status
end
end
end
resources :bank_statements do
resources :bank_transactions

Binary file not shown.

View file

@ -0,0 +1,268 @@
#!/usr/bin/env python3
import os
import re
import sys
import json
import random
import numpy as np
from PIL import Image
from os import path
from wordcloud import WordCloud, STOPWORDS
import openai
import matplotlib.pyplot as plt
from dotenv import load_dotenv
load_dotenv()
# Command line: generate_wordcloud.py <domains_file> [output_dir] [config_file]

# Directory of this script; the mask image and the font are loaded from here
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
# Generated files go to the second argument, or next to the script by default
output_dir = sys.argv[2] if len(sys.argv) > 2 else d

# Load configuration if provided; a missing file simply leaves the defaults
config = {}
if len(sys.argv) > 3 and sys.argv[3]:
    config_file = sys.argv[3]
    if path.exists(config_file):
        # Explicit encoding: configured terms may contain non-ASCII characters
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)
        print(f"Loaded configuration: {config}")

# The domains file is mandatory and must exist
if len(sys.argv) > 1 and sys.argv[1]:
    domains_file = sys.argv[1]
    if not path.exists(domains_file):
        print(f"Error: Provided domains file {domains_file} not found")
        sys.exit(1)
else:
    print("Error: Domains file not found")
    sys.exit(1)

# One domain per line; blank lines are ignored. 'utf-8-sig' also accepts files
# that start with a byte order mark, which would otherwise end up glued to the
# first domain name.
with open(domains_file, 'r', encoding='utf-8-sig') as f:
    domain_names = [line.strip().lower() for line in f if line.strip()]

if not domain_names:
    print("Error: No domain names found in the provided file")
    sys.exit(1)

# Get special terms from config or use defaults
SPECIAL_TERMS = config.get('special_terms', ['e-', 'i-', '2-', '3-', '4-', '.com', 'tr.ee', 'ai', 'web'])
print(f"Using special terms: {SPECIAL_TERMS}")

# Get batch size from config or use default; never below 1, because a batch
# size of zero or less cannot be used to slice the domain list
BATCH_SIZE = max(1, int(config.get('batch_size', 500)))
print(f"Using batch size: {BATCH_SIZE}")
def extract_words_with_openai(domain_names, special_terms, batch_size=None):
    """Split domain names into words by asking an OpenAI chat model, batch by batch.

    Args:
        domain_names: list of domain name strings.
        special_terms: terms the model is told to keep as separate tokens.
        batch_size: number of domains sent per API call. Defaults to the
            module-level BATCH_SIZE, looked up when the function is called.

    Returns:
        A flat list of all words returned for all batches. A batch whose API
        call fails is reported on stdout and contributes no words.

    Raises:
        ValueError: if OPENAI_API_KEY is not set or batch_size is below 1.

    The model, temperature and token limit come from the OPENAI_MODEL,
    OPENAI_TEMPERATURE and OPENAI_MAX_TOKENS environment variables.
    """
    # Imported here so the client is resolved when the function is called
    import openai

    if batch_size is None:
        batch_size = BATCH_SIZE
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Get API key from environment variable
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")

    # Get model and temperature from environment variables
    model = os.environ.get("OPENAI_MODEL", "gpt-4.1-2025-04-14")
    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0.3"))
    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "2000"))

    # Matches a real list marker only ("1. ", "2) ", "- ", "* "). Number-like
    # tokens such as "24" or "2-" at the start of a line are words and must
    # survive the cleanup.
    list_marker = re.compile(r'^(?:\d+[.)]|[-*•])\s+')

    all_words = []
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_cost = 0

    # Calculate number of batches (ceiling division)
    num_batches = (len(domain_names) + batch_size - 1) // batch_size

    for i in range(0, len(domain_names), batch_size):
        batch = domain_names[i:i+batch_size]
        # This line is parsed by the caller to report progress; keep its format
        print(f"Processing batch {i//batch_size + 1}/{num_batches} ({len(batch)} domains)...")
        sys.stdout.flush()

        # Prepare the prompt with domain names and special terms
        domains_text = "\n".join(batch)
        special_terms_text = ", ".join([f"`{term}`" for term in special_terms])
        prompt = f"""You are a bilingual Estonian-English linguist and word segmentation expert. I will give you a list of .ee domain names.
Your task is to extract a clean list of words for word cloud generation.
Follow these rules strictly:
1. Before doing anything else, always extract and separate these predefined special terms if they appear as prefixes or parts of the domain name: {special_terms_text}. Keep symbols and numbers as they are. For example, if the domain name is `e-robot.ee`, the output should be `e- robot`. Remove extensions from the special terms.
2. If a word contains a number (e.g., `auto24`), separate the number and the word: `auto`, `24`.
3. If the domain name is a compound of 2+ Estonian or English words (e.g., `virtuaalabiline` or `doorkeeper`), intelligently split them into individual meaningful components. Prioritize Estonian words over English words.
4. Keep all resulting words in lowercase and remove the `.ee` extension from all the words
5. Try to find the most common words and phrases in the domain names.
6. Return ONLY a space-separated list of words and numbers with no explanations, no formatting, no introductions, and no additional text.
Example output format:
word1 word2 word3 word4 word5
Here are the domain names:
{domains_text}
"""

        # Make the API call
        try:
            print(f"Using model: {model} with temperature: {temperature}")
            response = openai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that extracts words from domain names. You ONLY output the extracted words with no additional text."},
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                max_tokens=max_tokens
            )

            # Track token usage
            prompt_tokens = response.usage.prompt_tokens
            completion_tokens = response.usage.completion_tokens
            total_tokens = response.usage.total_tokens
            total_prompt_tokens += prompt_tokens
            total_completion_tokens += completion_tokens
            print(f"Token usage - Prompt: {prompt_tokens}, Completion: {completion_tokens}, Total: {total_tokens}")

            # Calculate cost (approximate; only GPT-4.1 pricing is known here)
            if "gpt-4.1" in model:
                prompt_cost = (prompt_tokens / 1000000) * 2.00  # $2.00 per 1M tokens for GPT-4.1 input
                completion_cost = (completion_tokens / 1000000) * 8.00  # $8.00 per 1M tokens for GPT-4.1 output
            else:
                prompt_cost = 0
                completion_cost = 0
            batch_cost = prompt_cost + completion_cost
            total_cost += batch_cost
            print(f"Estimated batch cost: ${batch_cost:.6f}")

            # Extract the words from the response
            words_text = response.choices[0].message.content.strip()

            # Process the response to get a clean list of words
            batch_words = []
            for line in words_text.split('\n'):
                line = line.strip()
                # Skip Markdown code fences the model may wrap its answer in
                if line and not line.startswith('```') and not line.endswith('```'):
                    cleaned_line = list_marker.sub('', line)
                    if cleaned_line:
                        batch_words.extend(cleaned_line.split())

            all_words.extend(batch_words)
            print(f"Extracted {len(batch_words)} words from this batch")
        except Exception as e:
            # Best effort: a failed batch is reported and skipped so the
            # remaining batches can still contribute words
            print(f"Error calling OpenAI API for batch: {e}")

    print(f"Total token usage - Prompt: {total_prompt_tokens}, Completion: {total_completion_tokens}")
    # This line is parsed by the caller as the end of word extraction
    print(f"Total estimated cost: ${total_cost:.6f}")
    return all_words
# Process domain names using OpenAI
print("Extracting words from domain names using OpenAI...")
extracted_words = extract_words_with_openai(domain_names, SPECIAL_TERMS)
print(f"Extracted {len(extracted_words)} words")
# print("Sample of extracted words:", extracted_words)

# Failed batches are skipped during extraction, so the result can be empty.
# Stop with a clear message instead of failing later while building the cloud.
if not extracted_words:
    print("Error: No words could be extracted from the domain names")
    sys.exit(1)

# Join the extracted words for the word cloud
processed_text = ' '.join(extracted_words)
# print("Processed text sample:", processed_text)
def custom_color_func(word, font_size, position, orientation, random_state=None,
                      **kwargs):
    """Colour callback for the word cloud: a fixed blue hue with random lightness.

    All arguments are ignored. The lightness is drawn from the module-level
    random generator and lies between 15% and 80% inclusive.
    """
    lightness = random.randint(15, 80)
    return "hsl(215, 100%%, %d%%)" % lightness
# Shape mask shipped next to this script
mask = np.array(Image.open(path.join(d, 'mask.png')))

# Get configuration values with defaults
width = int(config.get('width', 800))
height = int(config.get('height', 800))
max_words = int(config.get('max_words', 500))
background_color = config.get('background_color', 'white')
min_word_length = int(config.get('min_word_length', 2))
include_numbers = config.get('include_numbers', True)

# Handle transparent background: WordCloud renders a transparent canvas only
# when background_color is None AND the image mode has an alpha channel
transparent_background = background_color == 'transparent'
if transparent_background:
    background_color = None
image_mode = 'RGBA' if transparent_background else 'RGB'

# Stopwords: the library's built-in list, the Estonian/English list below and
# whatever the user configured
additional_stopwords = config.get('additional_stopwords', [])
custom_stopwords = {
    'ja', 'ning', 'et', 'kui', 'aga', 'ka', 'ei', 'see', 'on', 'ole',
    'oma', 'seda', 'siis', 'või', 'mis', 'nii', 'veel', 'kes', 'üle',
    'välja', 'olema', 'kus', 'nagu', 'kuid', 'selle', 'pole', 'ära',
    'vaid', 'sest', 'juba', 'meie', 'mida', 'need', 'olid', 'minu',
    'tema', 'pärast', 'mingi', 'palju', 'kõik', 'seal', 'olen', 'oled',
    'oli', 'olnud', 'ongi', 'poolt', 'meil', 'teda', 'just', 'kuna',
    'läbi', 'küll',
    'the', 'and', 'a', 'to', 'of', 'in', 'is', 'that', 'it', 'for',
    'with', 'as', 'be', 'on', 'not', 'this', 'but', 'by', 'from', 'are',
    'or', 'an', 'at', 'was', 'have', 'has', 'had', 'were', 'will', 'would',
    'should', 'can', 'could', 'may', 'might', 'must', 'do', 'does', 'did',
    'doing', 'done', 'their', 'they', 'them', 'there', 'these', 'those',
    'which', 'who', 'whom', 'whose', 'what', 'when', 'where', 'why', 'how'
}
stopwords = set(STOPWORDS)
stopwords.update(custom_stopwords)
stopwords.update(additional_stopwords)

font_path = path.join(d, 'fonts', 'Pacifico-Regular.ttf')
# Alternative: use a system font
# font_path = fm.findfont(fm.FontProperties(family='Arial'))

# This line is parsed by the caller to report progress; keep its wording
print("Generating word cloud...")
wc = WordCloud(width=width, height=height,
               mask=mask,
               mode=image_mode,
               stopwords=stopwords,
               background_color=background_color,
               max_words=max_words,
               include_numbers=include_numbers,
               collocations=False,
               min_word_length=min_word_length,
               regexp=r"[A-Za-zÕÄÖÜõäöü0-9][\w\-'ÕÄÖÜõäöü]*(?<!\.ee)(?<!ee)",
               font_path=font_path)
wc.generate(processed_text)

# Get word frequencies from the word cloud
word_frequencies = wc.process_text(processed_text)

# Remove stopwords from the frequencies
word_frequencies = {word: freq for word, freq in word_frequencies.items()
                    if word.lower() not in stopwords}

# Sort words by frequency (highest first)
sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)

# Get top 10 words
top_10_words = sorted_words[:10]

# Print top 10 words to console
print("\nTop 10 most frequent words:")
for word, freq in top_10_words:
    print(f"{word}: {freq}")

# Save top 10 words to a text file ("<rank>. <word>: <count>" per line)
top_words_file = path.join(output_dir, 'top_words.txt')
with open(top_words_file, 'w', encoding='utf-8') as f:
    f.write("Top 10 most frequent words:\n")
    for i, (word, freq) in enumerate(top_10_words, 1):
        f.write(f"{i}. {word}: {freq}\n")

print(f"\nTop words saved to {top_words_file}")

# Apply the blue colour scheme. recolor() updates the cloud in place, so the
# file written below uses these colours. No figure is opened: this script is
# meant to run unattended and only the saved file is needed.
wc.recolor(color_func=custom_color_func, random_state=3)

# Save the word cloud to file
wc.to_file(path.join(output_dir, 'wordcloud.png'))

BIN
lib/wordcloud/mask.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

View file

@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","special_terms":["e-","i-","2-","3-","4-",".com","ai","web"]}

View file

@ -0,0 +1,11 @@
Top 10 most frequent words:
1. tr: 4
2. auto: 4
3. 2-: 4
4. faktor: 4
5. e-: 2
6. i-: 2
7. digi: 2
8. car: 2
9. ai: 1
10. robot: 1

Binary file not shown.

After

Width:  |  Height:  |  Size: 383 KiB