This commit is contained in:
Sergey Tsyganov 2025-08-10 02:07:10 +00:00 committed by GitHub
commit 42b6e792b3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 1163 additions and 2 deletions

View file

@ -62,6 +62,16 @@ RUN apt-get install -y --no-install-recommends > /dev/null \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install Python packages for wordcloud generation.
# The generator script imports `load_dotenv` from the `dotenv` module; that module is
# shipped by the `python-dotenv` distribution on PyPI, so install it under that name.
# --no-cache-dir on both pip invocations keeps pip's download cache out of the layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3-pip \
    python3-setuptools \
    python3-dev \
    && pip3 install --no-cache-dir --upgrade pip setuptools wheel \
    && pip3 install --no-cache-dir numpy Pillow matplotlib wordcloud openai python-dotenv \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
RUN apt-get autoremove -y && apt-get clean
ENV CHROME_VERSION="128.0.6613.137"
@ -95,7 +105,6 @@ ENV PATH="/opt/chrome-linux64:${PATH}"
RUN ln -s /lib/ld-linux.so.2 /lib/ld-linux.so.2 || true
# Wrapper for wkhtmltopdf that runs it under xvfb
RUN echo '#!/bin/bash\nxvfb-run -a --server-args="-screen 0, 1024x768x24" /usr/bin/wkhtmltopdf "$@"' > /usr/local/bin/wkhtmltopdf \
&& chmod +x /usr/local/bin/wkhtmltopdf

View file

@ -113,3 +113,6 @@ gem 'net-ftp'
# https://stackoverflow.com/questions/79360526/uninitialized-constant-activesupportloggerthreadsafelevellogger-nameerror
gem 'concurrent-ruby', '1.3.4'
# gives you access to stdin, stdout, and stderr when running other programs
gem 'open3'

View file

@ -394,6 +394,7 @@ GEM
omniauth-rails_csrf_protection (0.1.2)
actionpack (>= 4.2)
omniauth (>= 1.3.1)
open3 (0.2.1)
openid_connect (1.4.2)
activemodel
attr_required (>= 1.0.0)
@ -636,6 +637,7 @@ DEPENDENCIES
nokogiri (~> 1.16.0)
omniauth-rails_csrf_protection
omniauth-tara!
open3
openssl
paper_trail (~> 14.0)
pdfkit

View file

@ -0,0 +1,175 @@
# frozen_string_literal: true
module Admin
module Tools
# Controller for the admin wordcloud generator tool that creates visual representations
# of the most common words used in domain names
class WordcloudController < BaseController # rubocop:disable Metrics/ClassLength
WORDCLOUD_DIR = Rails.root.join('public', 'wordcloud')
WORDCLOUD_IMAGE_PATH = WORDCLOUD_DIR.join('wordcloud.png')
WORDCLOUD_CONFIG_PATH = WORDCLOUD_DIR.join('config.json')
TOP_WORDS_PATH = WORDCLOUD_DIR.join('top_words.txt')
before_action :authorize_admin
before_action :clear_cache, only: :create
before_action :ensure_wordcloud_dir, only: :create
# Shows the wordcloud page: loads the saved generator settings and, when a
# rendered image is present on disk, the data needed to display it.
def index
  @config = load_wordcloud_config

  # An existing image means there is something to display.
  return setup_wordcloud_data if File.exist?(WORDCLOUD_IMAGE_PATH)

  # No image yet: make that explicit for the template.
  @wordcloud_url = nil
end
# Accepts an uploaded domain list, stores the chosen settings and enqueues
# the background job that renders the wordcloud.
#
# Redirects back to the wordcloud page with a flash alert when no file was
# uploaded, when the upload is rejected by process_uploaded_file, or when
# anything raises; otherwise redirects to the progress page.
#
# Flash texts are looked up under the `admin.tools.wordcloud.*` scope, which
# is where the tool's locale entries (`no_file`, `error`) are defined.
def create
  if params[:domains_file].blank?
    flash[:alert] = I18n.t('admin.tools.wordcloud.no_file')
    return redirect_to admin_tools_wordcloud_path
  end

  # process_uploaded_file sets its own flash message and returns nil on rejection.
  domains_file_path = process_uploaded_file(params[:domains_file])
  return redirect_to admin_tools_wordcloud_path if domains_file_path.nil?

  # Collect and save configuration so the form can be pre-filled next time.
  config = build_config_from_params
  File.write(WORDCLOUD_CONFIG_PATH, config.to_json)

  # Start the background job
  GenerateWordCloudJob.perform_later(domains_file_path.to_s, current_admin_user.id, config)
  redirect_to progress_admin_tools_wordcloud_path
rescue StandardError => e
  logger.error "Error starting wordcloud generation: #{e.message}"
  flash[:alert] = "#{I18n.t('admin.tools.wordcloud.error')}: #{e.message}"
  redirect_to admin_tools_wordcloud_path
end
# GET /admin/tools/wordcloud/progress
# Exposes the current admin's cached job state to the progress template,
# falling back to a "not started" placeholder when nothing is cached.
def progress
  @progress_key = "wordcloud_progress:#{current_admin_user.id}"
  cached_state = Rails.cache.fetch(@progress_key)
  @progress_data = cached_state || { status: 'not_started', progress: 0 }
end
# GET /admin/tools/wordcloud/status
# Returns the current admin's cached job state as JSON, or a "not started"
# placeholder when nothing is cached.
def status
  cache_key = "wordcloud_progress:#{current_admin_user.id}"
  state = Rails.cache.fetch(cache_key) || { status: 'not_started', progress: 0 }
  render(json: state)
end
private
# Creates the wordcloud output directory (including parents) when it is missing.
def ensure_wordcloud_dir
  return if Dir.exist?(WORDCLOUD_DIR)

  FileUtils.mkdir_p(WORDCLOUD_DIR)
end
# Copies the uploaded domain list to a file under Rails.root/tmp so it is
# still available after the request finishes.
#
# uploaded_file - the uploaded file object from params (must respond to
#                 #tempfile).
#
# Returns the Pathname of the copy, or nil (with flash[:alert] set) when the
# upload is empty.
def process_uploaded_file(uploaded_file)
  source_path = uploaded_file.tempfile.path

  # Reject empty uploads before copying, so no file has to be created and removed again.
  # The message uses the existing `admin.tools.wordcloud.no_file` entry
  # ("No domain names found"); no dedicated "empty file" entry is defined.
  if File.zero?(source_path)
    flash[:alert] = I18n.t('admin.tools.wordcloud.no_file')
    return nil
  end

  # Create a persistent copy of the uploaded file
  persistent_file_path = Rails.root.join('tmp', "domains_#{Time.now.to_i}.csv")
  FileUtils.cp(source_path, persistent_file_path)
  persistent_file_path
end
# Builds the generator settings hash from the submitted form parameters.
#
# Blank values fall back to the built-in defaults. The optional list fields
# (additional stopwords, special terms) are split on whitespace and commas
# and are only added to the hash when at least one entry remains.
#
# Returns a Hash with symbol keys.
def build_config_from_params
  split_terms = ->(raw) { raw.split(/[\s,]+/).reject(&:empty?) }

  settings = {
    width: params[:width].presence || 800,
    height: params[:height].presence || 800,
    max_words: params[:max_words].presence || 500,
    background_color: params[:background_color].presence || 'white',
    min_word_length: params[:min_word_length].presence || 2,
    include_numbers: params[:include_numbers] == '1',
    batch_size: params[:batch_size].presence || 500,
    additional_prompt: params[:additional_prompt].presence
  }

  # Stopwords are normalised to lower case before splitting.
  if params[:additional_stopwords].present?
    stopwords = split_terms.call(params[:additional_stopwords].downcase)
    settings[:additional_stopwords] = stopwords if stopwords.any?
  end

  # Special terms keep the case they were entered in.
  if params[:special_terms].present?
    terms = split_terms.call(params[:special_terms])
    settings[:special_terms] = terms if terms.any?
  end

  settings
end
# Reads the saved generator settings from the JSON config file.
#
# Returns the parsed JSON, or the default configuration when the file does
# not exist or does not contain valid JSON.
def load_wordcloud_config
  return default_wordcloud_config unless File.exist?(WORDCLOUD_CONFIG_PATH)

  JSON.parse(File.read(WORDCLOUD_CONFIG_PATH))
rescue JSON::ParserError
  default_wordcloud_config
end
# Prepares the instance variables used to display an existing wordcloud image:
# its URL, its generation time and the list of top words.
def setup_wordcloud_data
  # The image's mtime doubles as a cache-busting query parameter.
  @wordcloud_url = format('/wordcloud/wordcloud.png?t=%d', File.mtime(WORDCLOUD_IMAGE_PATH).to_i)

  # Same modification time, expressed in the application's time zone.
  @wordcloud_generated_at = File.mtime(WORDCLOUD_IMAGE_PATH).in_time_zone(Time.zone)

  load_top_words
end
# Loads the ranked word list from the top-words file into @top_words.
#
# Each accepted line has the form "<rank>. <word>: <count>". The word may be
# any run of non-whitespace characters, so non-ASCII letters and terms that
# contain hyphens or dots are kept (Ruby's `\w` only matches ASCII word
# characters). Surrounding whitespace, including a CRLF line ending, is
# ignored. Lines in any other format are skipped.
#
# Leaves @top_words untouched when the file does not exist; otherwise sets
# it to an Array of [word, count] pairs in file order.
def load_top_words
  return unless File.exist?(TOP_WORDS_PATH)

  @top_words = File.readlines(TOP_WORDS_PATH).each_with_object([]) do |line, words|
    entry = line.strip.match(/\A\d+\.\s+(?<word>\S+):\s+(?<count>\d+)\z/)
    words << [entry[:word], entry[:count].to_i] if entry
  end
end
# Built-in generator settings used when no saved configuration can be loaded.
# Keys are strings, matching what JSON.parse produces for a saved config.
def default_wordcloud_config
  {
    'width' => 800,
    'height' => 800,
    'max_words' => 500,
    'background_color' => 'white',
    'additional_stopwords' => [],
    'include_numbers' => true,
    'min_word_length' => 2,
    'special_terms' => %w[e- i- 2- 3- 4- .com tr.ee ai web],
    'batch_size' => 500
  }
end
# Requires the :access permission on :tools for every action of this controller.
def authorize_admin
  authorize!(:access, :tools)
end
# Removes the current admin's cached progress entry before a new run starts.
def clear_cache
  stale_key = "wordcloud_progress:#{current_admin_user.id}"
  Rails.cache.delete(stale_key)
end
end
end
end

View file

@ -0,0 +1,14 @@
module Admin
  # Landing page for the admin tools section. Every action requires the
  # :access permission on :tools.
  class ToolsController < BaseController
    before_action :authorize_admin

    # GET /admin/tools
    # Nothing to load; the template is rendered as-is.
    def index
    end

    private

    # Authorization check run before each action.
    def authorize_admin
      authorize!(:access, :tools)
    end
  end
end

View file

@ -0,0 +1,119 @@
# Use Open3 to capture output in real-time
require 'open3'
# Background job that generates a wordcloud image from domain names
# using an external Python script with progress tracking
class GenerateWordCloudJob < ApplicationJob
# Runs one wordcloud generation.
#
# domains_file_path - String path of the file holding the domain names.
# user_id           - id used to build the progress cache key.
# config            - Hash of generator settings passed on to the script.
#
# Progress is initialised first; any StandardError raised while preparing
# or running the script is passed to handle_error, and cleanup always runs.
def perform(domains_file_path, user_id, config = {})
  Rails.logger.info("Generating wordcloud for #{domains_file_path}")

  # Inputs
  @domains_file_path = domains_file_path
  @user_id = user_id
  @config = config

  # Derived state
  @progress_key = "wordcloud_progress:#{user_id}"
  @wordcloud_dir = Rails.root.join('public', 'wordcloud')
  @config_file_path = nil

  initialize_progress

  begin
    setup_environment
    run_wordcloud_script
  rescue StandardError => e
    handle_error(e)
  ensure
    cleanup
  end
end
private
# Writes the initial "processing, 0%" state to the progress cache entry.
def initialize_progress
  initial_state = { status: 'processing', progress: 0 }
  Rails.cache.write(@progress_key, initial_state)
end
# Prepares everything the Python script needs: the output directory, a
# JSON file with the settings for this run, and an executable script.
def setup_environment
  # Ensure the wordcloud directory exists
  FileUtils.mkdir_p(@wordcloud_dir) unless Dir.exist?(@wordcloud_dir)

  # Per-run settings file handed to the script; removed again in cleanup.
  # NOTE(review): it is written into the output directory under public/ - confirm
  # that a short-lived file there is acceptable.
  config_name = "wordcloud_config_#{Time.now.to_i}.json"
  @config_file_path = Rails.root.join(@wordcloud_dir, config_name)
  File.write(@config_file_path, @config.to_json)

  # Locate the script and make sure its executable bit is set.
  @script_path = Rails.root.join('lib', 'wordcloud', 'generate_wordcloud.py')
  return if File.executable?(@script_path)

  FileUtils.chmod('+x', @script_path)
end
# Launches the Python script and streams its combined stdout/stderr to
# process_script_output.
#
# The interpreter comes from PYTHON_EXECUTABLE (default "python3"). The
# script receives the domains file, the output directory and the settings
# file as positional arguments.
def run_wordcloud_script
  interpreter = ENV.fetch('PYTHON_EXECUTABLE', 'python3')
  script_env = { 'PYTHONIOENCODING' => 'utf-8', 'PYTHONUNBUFFERED' => '1' }
  command = [
    interpreter,
    @script_path.to_s,
    @domains_file_path,
    @wordcloud_dir.to_s,
    @config_file_path.to_s
  ]

  Open3.popen2e(script_env, *command) do |stdin, output, wait_thr|
    # The script takes no input; close stdin right away.
    stdin.close
    process_script_output(output, wait_thr)
  end
end
# Consumes the script's output until it ends, then records the exit status.
#
# stdout_err - IO yielding the script's combined output.
# wait_thr   - object whose #value is the process exit status.
def process_script_output(stdout_err, wait_thr)
  stdout_err.each_line do |output_line|
    update_progress_from_output(output_line)
    Rails.logger.info("WordCloud: #{output_line.strip}")
  end

  handle_exit_status(wait_thr.value)
end
# Translates one line of script output into a progress update.
#
# Recognised lines:
#   "Processing batch N/M"  - scaled into the 0..80 range
#   "Total estimated cost"  - 80
#   "Generating word cloud" - 90
# Any other line is ignored.
#
# A batch line reporting zero total batches is skipped: dividing by zero
# yields Infinity/NaN, and rounding that raises FloatDomainError.
def update_progress_from_output(line)
  case line
  when %r{Processing batch (?<current>\d+)/(?<total>\d+)}
    batch = Regexp.last_match
    current = batch[:current].to_i
    total = batch[:total].to_i
    return if total.zero?

    # Batch processing owns the first 80%; never report beyond that band.
    update_progress(((current.to_f / total) * 80).round.clamp(0, 80))
  when /Total estimated cost/
    update_progress(80)
  when /Generating word cloud/
    update_progress(90)
  end
end
# Stores the given progress value and status in the progress cache entry.
#
# value  - progress value to store.
# status - status label to store (default: 'processing').
def update_progress(value, status: 'processing')
  state = { status: status, progress: value }
  Rails.cache.write(@progress_key, state)
end
# Records the final state once the script has exited: 100% / 'completed'
# on success, otherwise a 'failed' entry carrying the exit code.
#
# exit_status - object responding to #success? and #exitstatus.
def handle_exit_status(exit_status)
  return update_progress(100, status: 'completed') if exit_status.success?

  failure_state = {
    status: 'failed',
    progress: 0,
    error: "Process failed with status #{exit_status.exitstatus}"
  }
  Rails.cache.write(@progress_key, failure_state)
end
# Logs the exception (message and backtrace) and marks the run as failed in
# the progress cache, storing the exception message as the error text.
def handle_error(exception)
  Rails.logger.error("Error in WordCloud job: #{exception.message}")
  Rails.logger.error(exception.backtrace.join("\n"))

  failure_state = { status: 'failed', progress: 0, error: exception.message }
  Rails.cache.write(@progress_key, failure_state)
end
# Deletes the per-run settings file, if one was created and still exists.
def cleanup
  return unless @config_file_path
  return unless File.exist?(@config_file_path)

  File.delete(@config_file_path)
end
end

View file

@ -121,6 +121,7 @@ class Ability
can :destroy, :pending
can :create, :zonefile
can :access, :settings_menu
can :access, :tools
can :manage, :mass_actions
can :manage, BouncedMailAddress
end

View file

@ -6,6 +6,8 @@
%li= link_to t(:contacts), admin_contacts_path
- if can? :show, Registrar
%li= link_to t(:registrars), admin_registrars_path
- if can?(:access, :tools)
%li= link_to t(:tools), admin_tools_path
- if can?(:access, :settings_menu)
%li.dropdown
%a.dropdown-toggle{"data-toggle" => "dropdown", href: "#"}

View file

@ -0,0 +1,23 @@
<%= render "shared/title", name: t('admin.tools.title') %>
<div class="row">
<div class="col-md-12">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><%= t('admin.tools.available_tools') %></h3>
</div>
<div class="panel-body">
<div class="row">
<div class="col-md-4">
<div class="well well-sm">
<h4><%= t('admin.tools.wordcloud_generator') %></h4>
<p><%= t('admin.tools.wordcloud_generator_description') %></p>
<%= link_to t('admin.tools.generate_wordcloud'), admin_tools_wordcloud_path, class: 'btn btn-primary' %>
</div>
</div>
<!-- Additional tools can be added here in similar well blocks -->
</div>
</div>
</div>
</div>
</div>

View file

@ -0,0 +1,114 @@
<%= form_tag admin_tools_wordcloud_path, method: :post, multipart: true do %>
<div class="row">
<div class="col-md-12">
<div class="form-group">
<div class="custom-file-upload">
<p class="text-muted"><%= t('admin.tools.wordcloud.custom_file_description') %></p>
<%= file_field_tag :domains_file, accept: '.csv', class: 'form-control' %>
</div>
</div>
</div>
</div>
<div class="panel panel-default mt-3">
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" href="#advancedOptions">
<i class="fa fa-cog"></i> <%= t('admin.tools.wordcloud.advanced_options') %>
</a>
</h4>
</div>
<div id="advancedOptions" class="panel-collapse collapse">
<div class="panel-body">
<div class="row">
<div class="col-md-4">
<div class="form-group">
<%= label_tag :width, t('admin.tools.wordcloud.width') %>
<%= number_field_tag :width, @config['width'], min: 400, max: 2000, step: 100, class: 'form-control' %>
</div>
</div>
<div class="col-md-4">
<div class="form-group">
<%= label_tag :height, t('admin.tools.wordcloud.height') %>
<%= number_field_tag :height, @config['height'], min: 400, max: 2000, step: 100, class: 'form-control' %>
</div>
</div>
<div class="col-md-4">
<div class="form-group">
<%= label_tag :max_words, t('admin.tools.wordcloud.max_words') %>
<%= number_field_tag :max_words, @config['max_words'], min: 100, max: 1000, step: 50, class: 'form-control' %>
</div>
</div>
</div>
<div class="row">
<div class="col-md-6">
<div class="form-group">
<%= label_tag :batch_size, t('admin.tools.wordcloud.batch_size') %>
<%= number_field_tag :batch_size, @config['batch_size'], min: 100, max: 1000, step: 50, class: 'form-control' %>
<small class="text-muted"><%= t('admin.tools.wordcloud.batch_size_help') %></small>
</div>
</div>
<div class="col-md-6">
<div class="form-group">
<%= label_tag :background_color, t('admin.tools.wordcloud.background') %>
<%= select_tag :background_color,
options_for_select([
['White', 'white'],
['Black', 'black'],
['Transparent', 'transparent'],
['Light Gray', '#f0f0f0']
], @config['background_color']),
class: 'form-control' %>
</div>
</div>
</div>
<div class="row">
<div class="col-md-6">
<div class="form-group">
<%= label_tag :min_word_length, t('admin.tools.wordcloud.min_word_length') %>
<%= number_field_tag :min_word_length, @config['min_word_length'], min: 1, max: 5, class: 'form-control' %>
</div>
</div>
<div class="col-md-6">
<div class="form-group">
<div class="checkbox" style="margin-top: 30px;">
<label>
<%= check_box_tag :include_numbers, '1', @config['include_numbers'] %>
<%= t('admin.tools.wordcloud.include_numbers') %>
</label>
</div>
</div>
</div>
</div>
<div class="form-group">
<%= label_tag :special_terms, t('admin.tools.wordcloud.special_terms') %>
<%= text_field_tag :special_terms, @config['special_terms'].is_a?(Array) ? @config['special_terms'].join(', ') : '',
class: 'form-control',
placeholder: t('admin.tools.wordcloud.special_terms_placeholder') %>
<small class="text-muted"><%= t('admin.tools.wordcloud.special_terms_help') %></small>
</div>
<div class="form-group">
<%= label_tag :additional_stopwords, t('admin.tools.wordcloud.additional_stopwords') %>
<%= text_area_tag :additional_stopwords, @config['additional_stopwords'].is_a?(Array) ? @config['additional_stopwords'].join(', ') : '',
rows: 3,
placeholder: t('admin.tools.wordcloud.stopwords_placeholder'),
class: 'form-control' %>
<small class="text-muted"><%= t('admin.tools.wordcloud.stopwords_help') %></small>
</div>
<div class="form-group">
<%= label_tag :additional_prompt, t('admin.tools.wordcloud.additional_prompt') %>
<%= text_area_tag :additional_prompt, @config['additional_prompt'], class: "form-control", rows: 3,
placeholder: t('admin.tools.wordcloud.additional_prompt_placeholder') %>
<small class="form-text text-muted"><%= t('admin.tools.wordcloud.additional_prompt_help') %></small>
</div>
</div>
</div>
</div>
<%= submit_tag t('admin.tools.generate_wordcloud'), class: 'btn btn-primary btn-lg mt-3' %>
<% end %>

View file

@ -0,0 +1,92 @@
<% content_for :actions do %>
<%= link_to t('back'), admin_tools_path, class: 'btn btn-default' %>
<% end %>
<%= render "shared/title", name: t('admin.tools.wordcloud.title') %>
<style>
.wordcloud-container {
margin-bottom: 20px;
}
.controls-section {
padding-top: 15px;
margin-top: 15px;
border-top: 1px solid #eee;
}
.instructions {
margin-bottom: 15px;
color: #555;
}
.mt-2 {
margin-top: 10px;
}
.wordcloud-container a {
display: block;
text-decoration: none;
padding: 5px;
border: 1px solid transparent;
transition: all 0.2s ease;
}
.wordcloud-container a:hover {
border-color: #ddd;
background-color: #f9f9f9;
border-radius: 4px;
}
.wordcloud-container a small {
color: #337ab7;
}
</style>
<div class="row">
<div class="col-md-12">
<div class="row">
<div class="col-md-8">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><%= t('admin.tools.wordcloud.title') %></h3>
</div>
<div class="panel-body text-center">
<% if @wordcloud_url %>
<div class="wordcloud-container">
<%= link_to @wordcloud_url, target: "_blank", title: t('admin.tools.wordcloud.view_full_size') do %>
<%= image_tag @wordcloud_url, class: 'img-responsive', alt: t('admin.tools.wordcloud.title') %>
<div class="text-center mt-2">
<small><i class="fa fa-search-plus"></i> <%= t('admin.tools.wordcloud.click_to_enlarge') %></small>
</div>
<% end %>
<% if @wordcloud_generated_at %>
<div class="text-muted mt-2">
<small><i class="fa fa-clock-o"></i> <%= t('admin.tools.wordcloud.generated_at', time: l(@wordcloud_generated_at, format: :long)) %></small>
</div>
<% end %>
</div>
<% end %>
<div class="instructions">
<p><%= t('admin.tools.wordcloud.instructions') %></p>
</div>
<%= render 'admin/tools/wordcloud/form' %>
</div>
</div>
</div>
<div class="col-md-4">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><%= t('admin.tools.wordcloud.top_words') %></h3>
</div>
<div class="panel-body">
<% if @top_words && @top_words.any? %>
<ol>
<% @top_words.each do |word, count| %>
<li><strong><%= word %></strong>: <%= count %></li>
<% end %>
</ol>
<% else %>
<p class="text-muted"><%= t('admin.tools.wordcloud.top_words_empty') %></p>
<% end %>
</div>
</div>
</div>
</div>
</div>
</div>

View file

@ -0,0 +1,93 @@
<div class="container-fluid">
<div class="row">
<div class="col-12">
<h1>WordCloud Generation Progress</h1>
<div class="card mb-4">
<div class="card-body">
<div class="progress mb-3">
<div id="progress-bar" class="progress-bar" role="progressbar" style="width: <%= @progress_data[:progress] %>%;"
aria-valuenow="<%= @progress_data[:progress] %>" aria-valuemin="0" aria-valuemax="100">
<%= @progress_data[:progress] %>%
</div>
</div>
<div id="status-message">
<% case @progress_data[:status] %>
<% when 'not_started' %>
<div class="alert alert-info">Waiting to start processing...</div>
<% when 'processing' %>
<div class="alert alert-info">Processing in progress...</div>
<% when 'completed' %>
<div class="alert alert-success">
WordCloud generation completed!
<%= link_to "View WordCloud", admin_tools_wordcloud_path, class: "btn btn-primary" %>
</div>
<% when 'failed' %>
<div class="alert alert-danger">
Error: <%= @progress_data[:error] || "Unknown error occurred" %>
</div>
<% end %>
</div>
<div class="mt-3">
<%= link_to "Back to Tools", admin_tools_path, class: "btn btn-secondary" %>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {
// Only poll if the process is not completed or failed
if ("<%= @progress_data[:status] %>" !== "completed" && "<%= @progress_data[:status] %>" !== "failed") {
pollProgress();
}
});
// Polls the status endpoint every 2 seconds and mirrors the reported job state
// into the progress bar and the status message.
// Polling stops as soon as a terminal state ('completed' or 'failed') is seen,
// so the redirect is scheduled only once and no further requests are sent.
function pollProgress() {
  const progressBar = document.getElementById('progress-bar');
  const statusMessage = document.getElementById('status-message');

  // Escapes server-provided text before it is interpolated into innerHTML.
  function escapeHtml(text) {
    const holder = document.createElement('div');
    holder.textContent = String(text);
    return holder.innerHTML;
  }

  // Poll every 2 seconds
  const timer = setInterval(function() {
    fetch('<%= status_admin_tools_wordcloud_path %>')
      .then(response => {
        // Treat HTTP errors as failures of this poll instead of parsing an error page.
        if (!response.ok) {
          throw new Error('Unexpected response status ' + response.status);
        }
        return response.json();
      })
      .then(data => {
        // Update progress bar
        progressBar.style.width = data.progress + '%';
        progressBar.setAttribute('aria-valuenow', data.progress);
        progressBar.textContent = data.progress + '%';
        // Update status message
        let statusHtml = '';
        switch(data.status) {
          case 'not_started':
            statusHtml = '<div class="alert alert-info">Waiting to start processing...</div>';
            break;
          case 'processing':
            statusHtml = '<div class="alert alert-info">Processing in progress...</div>';
            break;
          case 'completed':
            clearInterval(timer);
            statusHtml = '<div class="alert alert-success">WordCloud generation completed! ' +
              '<a href="<%= admin_tools_wordcloud_path %>">View WordCloud</a></div>';
            // Redirect after a short delay
            setTimeout(function() {
              window.location.href = '<%= admin_tools_wordcloud_path %>';
            }, 2000);
            break;
          case 'failed':
            clearInterval(timer);
            // The error text originates from the server-side job; escape it before display.
            statusHtml = '<div class="alert alert-danger">Error: ' +
              escapeHtml(data.error || "Unknown error occurred") + '</div>';
            break;
        }
        statusMessage.innerHTML = statusHtml;
      })
      .catch(error => {
        // A failed poll is logged; the next tick tries again.
        console.error('Error polling progress:', error);
      });
  }, 2000);
}
</script>

View file

@ -54,6 +54,10 @@ Rails.application.configure do
# Use a different cache store in production.
# config.cache_store = :mem_cache_store
config.cache_store = :redis_cache_store, {
url: "#{ENV.fetch('REDIS_URL', 'redis://localhost:6379')}/1",
expires_in: 300.seconds
}
# Use a real queuing backend for Active Job (and separate queues per environment)
config.active_job.queue_adapter = :sidekiq

View file

@ -2,7 +2,7 @@ require 'sidekiq/web' # Require at the top of the initializer
Sidekiq.configure_server do |config|
config.logger.level = Logger::INFO
# Custom job logging format
Sidekiq.logger.formatter = proc do |severity, datetime, progname, msg|
thread_id = Thread.current.object_id.to_s(36)

View file

@ -0,0 +1,42 @@
en:
admin:
tools:
title: "Administrative Tools"
available_tools: "Available Tools"
wordcloud_generator: "Domain Name Wordcloud Generator"
wordcloud_generator_description: "Generate a visual wordcloud from domain names in the registry"
generate_wordcloud: "Generate Wordcloud"
wordcloud:
title: "Domain Name Wordcloud"
success: "Wordcloud generated successfully"
error: "Error generating wordcloud"
processing: "Processing domain names. This may take a few minutes..."
instructions: "Generate a visual representation of the most common words in domain names. Click the button below to create the wordcloud."
top_words: "Top Words"
top_words_empty: "Generate a wordcloud to see the most frequent words."
click_to_enlarge: "Click to enlarge"
view_full_size: "View full size wordcloud image"
use_custom_domains: "Use custom domain list"
custom_file_description: "Upload a CSV file with one domain name per line"
file_upload_error: "Error processing uploaded file"
file_optional: "If no file is uploaded, all active domains in the registry will be used"
generated_at: "Generated at %{time}"
no_file: "No domain names found"
width: "Width"
height: "Height"
max_words: "Max Words"
background: "Background"
additional_stopwords: "Stopwords"
stopwords_placeholder: "Enter additional stopwords, one per line"
stopwords_help: "Stopwords are words that will not be included in the wordcloud"
advanced_options: "Advanced Options"
min_word_length: "Min Word Length"
include_numbers: "Include Numbers"
special_terms: "Special Terms"
special_terms_placeholder: "e.g., e-, i-, .com, ai, web"
special_terms_help: "These terms will be preserved in the word cloud even if they would normally be filtered out"
batch_size: "Batch Size"
batch_size_help: "Number of domains to process in each API call."
additional_prompt: "Additional Prompt Text"
additional_prompt_placeholder: "Add any additional instructions for the word cloud generation here..."
additional_prompt_help: "Optional text that will be used as additional context during word cloud generation."

View file

@ -229,6 +229,7 @@ en:
valid_from: 'Valid from'
general: 'General'
contacts: 'Contacts'
tools: 'Tools'
identity_code: 'Identity code'
nameservers: 'Nameservers'
hostname: 'Hostname'

View file

@ -250,6 +250,17 @@ Rails.application.routes.draw do
end
# post 'admi/upload_spreadsheet', to: 'customers#upload_spreadsheet', as: :customers_upload_spreadsheet
resources :tools, only: %i[index]
namespace :tools do
resource :wordcloud, controller: 'wordcloud', only: %i[create] do
collection do
get '', to: 'wordcloud#index'
get 'progress', to: 'wordcloud#progress', as: :progress
get 'status', to: 'wordcloud#status', as: :status
end
end
end
resources :bank_statements do
resources :bank_transactions

Binary file not shown.

View file

@ -0,0 +1,351 @@
#!/usr/bin/env python3
import os
import re
import sys
import json
import random
import asyncio
import numpy as np
from PIL import Image
from os import path
from wordcloud import WordCloud, STOPWORDS
from openai import AsyncOpenAI
import matplotlib.pyplot as plt
# import pandas as pd
from dotenv import load_dotenv
load_dotenv()
BATCH_SIZE = int(os.environ.get("OPENAI_BATCH_SIZE", "20"))
def load_system_prompt():
    """Return the text of ``system_prompt.md`` located next to this script.

    Raises:
        FileNotFoundError: if the prompt file does not exist.
    """
    prompt_file = path.join(path.dirname(__file__), 'system_prompt.md')
    if path.exists(prompt_file):
        with open(prompt_file, 'r', encoding='utf-8') as prompt_handle:
            return prompt_handle.read()
    raise FileNotFoundError(f"System prompt not found at {prompt_file}. Please create the file.")
# Command line: <script> <domains_file> [<output_dir> [<config_file>]]

# Directory of this script, or the current working directory when __file__ is unavailable.
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
# Second argument; falls back to the script directory when omitted.
output_dir = sys.argv[2] if len(sys.argv) > 2 else d
try:
    SYSTEM_PROMPT = load_system_prompt()
    print("System prompt successfully loaded from file.")
except FileNotFoundError as e:
    print(f"Error: {e}")
    sys.exit(1)
# Load configuration if provided
config = {}
if len(sys.argv) > 3 and sys.argv[3]:
    config_file = sys.argv[3]
    if path.exists(config_file):
        # Read as UTF-8 explicitly, like the domains file below, so non-ASCII
        # settings (e.g. stopwords) do not depend on the process locale.
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)
        print(f"Loaded configuration: {config}")
# Check if domains file path is provided and exists
if len(sys.argv) > 1 and sys.argv[1]:
    domains_file = sys.argv[1]
    if not path.exists(domains_file):
        print(f"Error: Provided domains file {domains_file} not found")
        sys.exit(1)
else:
    # No path argument at all - distinct from a path that does not exist.
    print("Error: No domains file path was provided")
    sys.exit(1)
# Read domain names from the file: one per line, lower-cased, blank lines skipped
with open(domains_file, 'r', encoding='utf-8') as f:
    domain_names = [line.strip().lower() for line in f if line.strip()]
if not domain_names:
    print("Error: No domain names found in the provided file")
    sys.exit(1)
# Function to extract words using OpenAI API asynchronously
async def extract_words_with_openai(domain_names, batch_size=BATCH_SIZE):
    """Split domain names into words by querying the OpenAI chat completions API.

    Domains whose first label (after removing ``www.``) consists only of digits
    are skipped. The remaining domains are sent in batches of ``batch_size``,
    with at most 10 requests in flight at once. A batch whose request or
    response handling raises is reported on stdout and contributes no words.

    Args:
        domain_names: iterable of domain name strings.
        batch_size: number of domains sent per API request.

    Returns:
        A flat list of the words returned for all successfully processed batches.

    Raises:
        ValueError: if the OPENAI_API_KEY environment variable is not set.
    """
    # Filter out domains that are only numbers
    filtered_domains = [
        domain for domain in domain_names
        if not domain.lower().replace('www.', '').split('.')[0].isdigit()
    ]

    # Get API key from environment variable
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")
    client = AsyncOpenAI(api_key=api_key)

    # Request settings come from the environment
    model = os.environ.get("OPENAI_MODEL", "gpt-4o-2024-11-20")
    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0"))
    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "16000"))

    # Running totals, updated by process_batch
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_cost = 0

    # Ceiling division: number of batches needed to cover every domain
    num_batches = (len(filtered_domains) + batch_size - 1) // batch_size
    # Limit to 10 concurrent requests
    semaphore = asyncio.Semaphore(10)

    # JSON schema of a single per-domain result in the structured response
    result_item_schema = {
        "type": "object",
        "properties": {
            "Language": {
                "type": "string",
                "description": "The language identified in the domain name."
            },
            "is_splitted": {
                "type": "string",
                "description": "Indicates whether the domain name is split into recognizable words."
            },
            "reasoning": {
                "type": "string",
                "description": "Explanation of the reasoning behind the language and word identification."
            },
            "words": {
                "type": "array",
                "description": "The words identified in the domain name.",
                "items": {
                    "type": "string"
                }
            }
        },
        "required": [
            "Language",
            "is_splitted",
            "reasoning",
            "words"
        ],
        "additionalProperties": False
    }
    # Structured-output specification sent with every request
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "domain_analysis_results",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "results": {
                        "type": "array",
                        "description": "A list of analysis results for the provided domains.",
                        "items": result_item_schema
                    }
                },
                "required": [
                    "results"
                ],
                "additionalProperties": False
            }
        }
    }

    async def process_batch(batch_idx):
        """Send one batch to the API and return the words extracted from it."""
        nonlocal total_prompt_tokens, total_completion_tokens, total_cost
        async with semaphore:
            start_idx = batch_idx * batch_size
            batch = filtered_domains[start_idx:start_idx + batch_size]
            print(f"Processing batch {batch_idx + 1}/{num_batches} ({len(batch)} domains)...")
            sys.stdout.flush()

            # Prepare the prompt with the domain names of this batch
            prompt = "List of domain names: " + "\n".join(batch)

            try:
                print(f"Using model: {model} with temperature: {temperature}")
                response = await client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
                    response_format=response_format,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )

                # Track token usage
                usage = response.usage
                prompt_tokens = usage.prompt_tokens
                completion_tokens = usage.completion_tokens
                total_prompt_tokens += prompt_tokens
                total_completion_tokens += completion_tokens
                print(f"Token usage - Prompt: {prompt_tokens}, Completion: {completion_tokens}, Total: {usage.total_tokens}")

                # Calculate cost (approximate); only GPT-4.1 prices are known here
                if "gpt-4.1" in model:
                    prompt_cost = (prompt_tokens / 1000000) * 2.00  # $2.00 per 1M tokens for GPT-4.1 input
                    completion_cost = (completion_tokens / 1000000) * 8.00  # $8.00 per 1M tokens for GPT-4.1 output
                else:
                    prompt_cost = 0
                    completion_cost = 0
                batch_cost = prompt_cost + completion_cost
                total_cost += batch_cost
                print(f"Estimated batch cost: ${batch_cost:.6f}")

                # Extract the words, skipping results the model marked as 'Ignore'
                response_json = json.loads(response.choices[0].message.content)
                batch_words = [
                    word
                    for result in response_json['results']
                    if result['Language'] != 'Ignore'
                    for word in result['words']
                ]
                print(f"Extracted {len(batch_words)} words from this batch")
                return batch_words
            except Exception as e:
                # Best effort: a failed batch is reported and yields no words
                print(f"Error calling OpenAI API for batch: {e}")
                return []

    # Run all batches concurrently and wait for results
    batch_results = await asyncio.gather(
        *(process_batch(batch_idx) for batch_idx in range(num_batches))
    )

    # Combine all words from all batches, in batch order
    all_words = [word for batch_words in batch_results for word in batch_words]

    print(f"Total token usage - Prompt: {total_prompt_tokens}, Completion: {total_completion_tokens}")
    print(f"Total estimated cost: ${total_cost:.6f}")
    return all_words
# Replace the synchronous call with an async function
async def main():
    """Build the domain-name word cloud and its top-10 word report.

    Steps:
      1. Split the module-level ``domain_names`` into words through
         ``extract_words_with_openai``.
      2. Configure a ``WordCloud`` from the module-level ``config`` mapping.
      3. Write ``top_words.txt`` and ``wordcloud.png`` into ``output_dir``.

    Relies on names defined elsewhere in this script: ``domain_names``,
    ``config``, ``output_dir`` and ``d`` (the directory that holds
    ``mask.png`` and the ``fonts`` folder).
    """
    # Process domain names using OpenAI
    print("Extracting words from domain names using OpenAI...")
    extracted_words = await extract_words_with_openai(domain_names)
    print(f"Extracted {len(extracted_words)} words")

    # WordCloud tokenizes one string, so join the extracted words with spaces.
    processed_text = ' '.join(extracted_words)

    def custom_color_func(word, font_size, position, orientation, random_state=None,
                          **kwargs):
        """Return an HSL colour with hue 215 and a random lightness of 15-80%."""
        return "hsl(215, 100%%, %d%%)" % random.randint(15, 80)

    mask = np.array(Image.open(path.join(d, 'mask.png')))

    # Get configuration values with defaults. The config file stores numbers as
    # strings (e.g. "800"), hence the int() conversions.
    width = int(config.get('width', 800))
    height = int(config.get('height', 800))
    max_words = int(config.get('max_words', 500))
    background_color = config.get('background_color', 'white')
    min_word_length = int(config.get('min_word_length', 2))
    include_numbers = config.get('include_numbers', True)

    # Handle transparent background: WordCloud only renders transparency when
    # background_color is None AND the image mode has an alpha channel (RGBA).
    mode = 'RGB'
    if background_color == 'transparent':
        background_color = None
        mode = 'RGBA'

    # Project-specific Estonian and English stopwords.
    custom_stopwords = {
        'ja', 'ning', 'et', 'kui', 'aga', 'ka', 'ei', 'see', 'on', 'ole',
        'oma', 'seda', 'siis', 'või', 'mis', 'nii', 'veel', 'kes', 'üle',
        'välja', 'olema', 'kus', 'nagu', 'kuid', 'selle', 'pole', 'ära',
        'vaid', 'sest', 'juba', 'meie', 'mida', 'need', 'olid', 'minu',
        'tema', 'pärast', 'mingi', 'palju', 'kõik', 'seal', 'olen', 'oled',
        'oli', 'olnud', 'ongi', 'poolt', 'meil', 'teda', 'just', 'kuna',
        'läbi', 'küll',
        'the', 'and', 'a', 'to', 'of', 'in', 'is', 'that', 'it', 'for',
        'with', 'as', 'be', 'on', 'not', 'this', 'but', 'by', 'from', 'are',
        'or', 'an', 'at', 'was', 'have', 'has', 'had', 'were', 'will', 'would',
        'should', 'can', 'could', 'may', 'might', 'must', 'do', 'does', 'did',
        'doing', 'done', 'their', 'they', 'them', 'there', 'these', 'those',
        'which', 'who', 'whom', 'whose', 'what', 'when', 'where', 'why', 'how'
    }

    # Start from the library defaults and ADD the custom and user-supplied
    # words. (Previously the defaults were overwritten by the custom set and
    # therefore never applied.)
    stopwords = set(STOPWORDS)
    stopwords.update(custom_stopwords)
    # `or []` also covers a JSON null stored under this key.
    stopwords.update(config.get('additional_stopwords') or [])

    font_path = path.join(d, 'fonts', 'Pacifico-Regular.ttf')
    # Alternative: use a system font
    # font_path = fm.findfont(fm.FontProperties(family='Arial'))

    print("Generating word cloud...")
    wc = WordCloud(width=width, height=height,
                   mask=mask,
                   stopwords=stopwords,
                   background_color=background_color,
                   mode=mode,
                   max_words=max_words,
                   include_numbers=include_numbers,
                   collocations=False,
                   min_word_length=min_word_length,
                   regexp=r"[A-Za-zÕÄÖÜõäöü0-9][\w\-'ÕÄÖÜõäöü]*(?<!\.ee)(?<!ee)",
                   font_path=font_path)
    wc.generate(processed_text)

    # Get word frequencies from the word cloud
    word_frequencies = wc.process_text(processed_text)

    # Remove stopwords from the frequencies
    word_frequencies = {word: freq for word, freq in word_frequencies.items()
                        if word.lower() not in stopwords}

    # Sort words by frequency (highest first) and keep the top 10
    sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)
    top_10_words = sorted_words[:10]

    # Print top 10 words to console
    print("\nTop 10 most frequent words:")
    for word, freq in top_10_words:
        print(f"{word}: {freq}")

    # Save top 10 words to a text file
    top_words_file = path.join(output_dir, 'top_words.txt')
    with open(top_words_file, 'w', encoding='utf-8') as f:
        f.write("Top 10 most frequent words:\n")
        for i, (word, freq) in enumerate(top_10_words, 1):
            f.write(f"{i}. {word}: {freq}\n")
    print(f"\nTop words saved to {top_words_file}")

    # Display the word cloud. recolor() is called before to_file(), so it must
    # stay ahead of the save below.
    # NOTE(review): recolor() appears to update `wc` in place, so the saved PNG
    # would carry the custom palette — confirm against the wordcloud docs.
    plt.imshow(wc.recolor(color_func=custom_color_func, random_state=3),
               interpolation="bilinear")
    plt.axis('off')
    plt.show()

    # Save the word cloud to file
    wc.to_file(path.join(output_dir, 'wordcloud.png'))
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
# asyncio.run() drives the main() coroutine to completion on an event loop.
asyncio.run(main())

BIN
lib/wordcloud/mask.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

View file

@ -0,0 +1,89 @@
You are a bilingual Estonian-English linguist and word-segmentation expert.
Your task is to identify which word or words a domain name consists of. You only work with English and Estonian words.
### INSTRUCTION
**Key “Language”**
You must determine the language of the domain name. The domain name can be a single word or several words. You have 3 options: Estonian, English, Ignore.
- Ignore the protocol, the leading “www.” sub-domain (if present) and the top-level domain (e.g. “.ee”, “.com”); they never influence language detection.
- If the domain consists of numbers, random letters, abbreviations, personal names, or is a transliteration from another language (for example, mnogoknig.ee from Russian), you should choose “Ignore” for Language.
- Otherwise, use a longest-match left-to-right lookup against (1) an Estonian core-vocabulary list, (2) a general English dictionary, (3) a whitelist of well-known abbreviations such as BMW, CAD, NGO, AI, EE. Whichever language supplies the majority of matched tokens becomes the value of Language.
- When tokens from both languages are present in roughly equal measure, choose the language that appears first in the domain string.
**Key “is_splitted”**
Here you must specify whether the domain name consists of more than one word.
- Treat a digit boundary (letter → digit or digit → letter) as an automatic split; the digit itself counts as a separate token.
- Treat a change of language (Estonian token followed by English token, or vice versa) as a split.
- Hyphens “-” or underscores “_” (even though rare in .ee domains) are explicit boundaries.
- Even if the domain includes an Estonian word plus an abbreviation, acronym or number, you still set “is_splitted” to true.
**Key “reasoning”**
Here, you should reason about which exact words and abbreviations make up the domain name.
- Work left → right, applying longest-match dictionary look-ups; if no match is possible and the fragment is ≤ 3 letters, treat it as an abbreviation; if it is longer, treat it as nonsense and set Language = Ignore.
- When you recognise an Estonian morphological ending (-id, -ed, -us, -ja, -jad, -te), peel it off and explain the root plus ending in the reasoning.
- If Language is Ignore, simply write “Ignore”. Otherwise, for every recognised word, abbreviation, symbol or number give a short definition or plausible meaning.
**Key “words”**
Based on the reasoning above, list only the words and tokens that make up the domain, in the order they appear.
- Omit “www”, TLDs and any punctuation.
- Keep digits as separate tokens (e.g. auto24.ee → “auto”, “24”).
- For fragments treated as abbreviations include the abbreviation exactly as it appears (“BMW”, “CAD”).
- If Language = Ignore, leave the array empty.
### EXAMPLES OF SPLITTING WORDS:
advanceautokool.ee: advance, auto, kool
1autosuvila.ee: auto, suvila
autoaks.ee: auto
autoeis.ee: auto
autoklaasitehnik.ee: auto, klaas, tehnik
autokoolmegalinn.ee: auto, kool, mega, linn
autoly.ee: auto
automatiseeri.ee: auto
autonova.ee: auto, nova
autor.ee: autor
autost24.ee: auto, 24
eestiaiandus.ee: eesti, aiandus
eestiastelpaju.ee: eesti, astelpaju
eestiloomekoda.ee: eesti, loomekoda
eestimadrats.ee: eesti, madrats
eestiost.ee: eesti, ost
eestipinglaed.ee: eesti, pinglaed
eestirohelineelu.ee: eesti, roheline, elu
eestiterviseuudised.ee: eesti, tervise, uudised
eheeesti.ee: ehe, eesti
ehitusliiv.ee: ehitus, liiv
ehitusgeodeesia.ee: ehitus, geodeesia
ehitusakadeemia.ee: ehitus, akadeemia
ehitusoutlet1.ee: ehitus, outlet
enpeehitus.ee: ehitus
eramuteehitus.ee: eramu, ehitus
fstehitus.ee: ehitus
hkehitusekspertiisid.ee: ehitus, ekspert
kronestehitus.ee: est, ehitus
makeehituspartner.ee: make, ehitus, partner
masirent.ee: rent
montessorirent.ee: montessoor, rent
paadirent1.ee: paadi, rent
pakiautorent.ee: paki, auto, rent
pixover.ee: pix, over
pixrent.ee: pix, rent
rentafriend.ee: rent, friend
rentbmw.ee: rent, bmw
reservrent.ee: reserv, rent
rentellix.ee: rent, ellix?
valmismajad.ee: valmis, maja
eramajadehooldus.ee: eramaja, hooldus
mastimajad.ee: mast, maja
nupsikpood.ee: nupsik, pood
poodcolordeco.ee: pood, color, deco
tarantlipood.ee: tarantli, pood
alyanstorupood.ee: toru, pood
arriumtech.ee: arrium, tech
xeniustech.ee: xenius, tech
whitechem.ee: white, chem
techme.ee: tech, me
techcad.ee: tech, cad
estonianharbours.ee: estonia, harbour
estonianspl.ee: estonia
hauratonestonia.ee: hauraton, estonia
koerahoidjatartus.ee: koer, hoidja, tartu
terrassidtartus.ee: terrass, tartu

View file

@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@ -0,0 +1,11 @@
Top 10 most frequent words:
1. auto: 80
2. eesti: 65
3. 24: 62
4. ehitus: 43
5. rent: 36
6. shop: 34
7. estonia: 30
8. pood: 27
9. tech: 27
10. tartu: 24

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 MiB

View file

@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}