mirror of
https://github.com/internetee/registry.git
synced 2025-08-16 06:23:57 +02:00
Merge 90cafc73c0
into c8a0788366
This commit is contained in:
commit
42b6e792b3
28 changed files with 1163 additions and 2 deletions
11
Dockerfile
11
Dockerfile
|
@ -62,6 +62,16 @@ RUN apt-get install -y --no-install-recommends > /dev/null \
|
||||||
&& apt-get clean \
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install Python packages for wordcloud generation
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
python3-pip \
|
||||||
|
python3-setuptools \
|
||||||
|
python3-dev \
|
||||||
|
&& pip3 install --upgrade pip setuptools wheel \
|
||||||
|
&& pip3 install --no-cache-dir numpy Pillow matplotlib wordcloud openai dotenv \
|
||||||
|
&& apt-get clean \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN apt-get autoremove -y && apt-get clean
|
RUN apt-get autoremove -y && apt-get clean
|
||||||
|
|
||||||
ENV CHROME_VERSION="128.0.6613.137"
|
ENV CHROME_VERSION="128.0.6613.137"
|
||||||
|
@ -95,7 +105,6 @@ ENV PATH="/opt/chrome-linux64:${PATH}"
|
||||||
|
|
||||||
RUN ln -s /lib/ld-linux.so.2 /lib/ld-linux.so.2 || true
|
RUN ln -s /lib/ld-linux.so.2 /lib/ld-linux.so.2 || true
|
||||||
|
|
||||||
# Обертка для wkhtmltopdf с xvfb
|
|
||||||
RUN echo '#!/bin/bash\nxvfb-run -a --server-args="-screen 0, 1024x768x24" /usr/bin/wkhtmltopdf "$@"' > /usr/local/bin/wkhtmltopdf \
|
RUN echo '#!/bin/bash\nxvfb-run -a --server-args="-screen 0, 1024x768x24" /usr/bin/wkhtmltopdf "$@"' > /usr/local/bin/wkhtmltopdf \
|
||||||
&& chmod +x /usr/local/bin/wkhtmltopdf
|
&& chmod +x /usr/local/bin/wkhtmltopdf
|
||||||
|
|
||||||
|
|
3
Gemfile
3
Gemfile
|
@ -113,3 +113,6 @@ gem 'net-ftp'
|
||||||
|
|
||||||
# https://stackoverflow.com/questions/79360526/uninitialized-constant-activesupportloggerthreadsafelevellogger-nameerror
|
# https://stackoverflow.com/questions/79360526/uninitialized-constant-activesupportloggerthreadsafelevellogger-nameerror
|
||||||
gem 'concurrent-ruby', '1.3.4'
|
gem 'concurrent-ruby', '1.3.4'
|
||||||
|
|
||||||
|
# gives you access to stdin, stdout, and stderr when running other programs
|
||||||
|
gem 'open3'
|
|
@ -394,6 +394,7 @@ GEM
|
||||||
omniauth-rails_csrf_protection (0.1.2)
|
omniauth-rails_csrf_protection (0.1.2)
|
||||||
actionpack (>= 4.2)
|
actionpack (>= 4.2)
|
||||||
omniauth (>= 1.3.1)
|
omniauth (>= 1.3.1)
|
||||||
|
open3 (0.2.1)
|
||||||
openid_connect (1.4.2)
|
openid_connect (1.4.2)
|
||||||
activemodel
|
activemodel
|
||||||
attr_required (>= 1.0.0)
|
attr_required (>= 1.0.0)
|
||||||
|
@ -636,6 +637,7 @@ DEPENDENCIES
|
||||||
nokogiri (~> 1.16.0)
|
nokogiri (~> 1.16.0)
|
||||||
omniauth-rails_csrf_protection
|
omniauth-rails_csrf_protection
|
||||||
omniauth-tara!
|
omniauth-tara!
|
||||||
|
open3
|
||||||
openssl
|
openssl
|
||||||
paper_trail (~> 14.0)
|
paper_trail (~> 14.0)
|
||||||
pdfkit
|
pdfkit
|
||||||
|
|
175
app/controllers/admin/tools/wordcloud_controller.rb
Normal file
175
app/controllers/admin/tools/wordcloud_controller.rb
Normal file
|
@ -0,0 +1,175 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
module Admin
|
||||||
|
module Tools
|
||||||
|
# Controller for the admin wordcloud generator tool that creates visual representations
|
||||||
|
# of the most common words used in domain names
|
||||||
|
class WordcloudController < BaseController # rubocop:disable Metrics/ClassLength
|
||||||
|
WORDCLOUD_DIR = Rails.root.join('public', 'wordcloud')
|
||||||
|
WORDCLOUD_IMAGE_PATH = WORDCLOUD_DIR.join('wordcloud.png')
|
||||||
|
WORDCLOUD_CONFIG_PATH = WORDCLOUD_DIR.join('config.json')
|
||||||
|
TOP_WORDS_PATH = WORDCLOUD_DIR.join('top_words.txt')
|
||||||
|
|
||||||
|
before_action :authorize_admin
|
||||||
|
before_action :clear_cache, only: :create
|
||||||
|
before_action :ensure_wordcloud_dir, only: :create
|
||||||
|
|
||||||
|
# Renders the wordcloud generator page.
#
# Exposes the stored (or default) generator settings as @config. When an
# image has already been generated, also prepares the data needed to display
# it; otherwise @wordcloud_url is explicitly set to nil.
def index
  @config = load_wordcloud_config

  unless File.exist?(WORDCLOUD_IMAGE_PATH)
    @wordcloud_url = nil
    return
  end

  setup_wordcloud_data
end
|
||||||
|
|
||||||
|
# Starts a wordcloud generation run.
#
# Requires an uploaded domains file, stores the submitted settings in
# WORDCLOUD_CONFIG_PATH, enqueues GenerateWordCloudJob and redirects to the
# progress page. Any failure along the way is logged, reported through
# flash[:alert] and answered with a redirect back to the generator page.
def create
  if params[:domains_file].blank?
    # Keys live under admin.tools.wordcloud.* in the locale file.
    flash[:alert] = I18n.t('admin.tools.wordcloud.no_file')
    return redirect_to admin_tools_wordcloud_path
  end

  domains_file_path = process_uploaded_file(params[:domains_file])
  # process_uploaded_file has already set the flash message when it returns nil.
  return redirect_to admin_tools_wordcloud_path if domains_file_path.nil?

  # Collect and save configuration
  config = build_config_from_params
  File.write(WORDCLOUD_CONFIG_PATH, config.to_json)

  # Start the background job
  GenerateWordCloudJob.perform_later(domains_file_path.to_s, current_admin_user.id, config)
  redirect_to progress_admin_tools_wordcloud_path
rescue StandardError => e
  logger.error "Error starting wordcloud generation: #{e.message}"
  flash[:alert] = "#{I18n.t('admin.tools.wordcloud.error')}: #{e.message}"
  redirect_to admin_tools_wordcloud_path
end
|
||||||
|
|
||||||
|
# GET /admin/tools/wordcloud/progress
|
||||||
|
# Prepares the progress page for the current admin user.
#
# Sets @progress_key to the per-user cache key and @progress_data to the
# cached progress hash, falling back to a "not started" placeholder when the
# cache holds nothing for that key.
def progress
  @progress_key = "wordcloud_progress:#{current_admin_user.id}"
  cached = Rails.cache.fetch(@progress_key)
  @progress_data = cached || { status: 'not_started', progress: 0 }
end
|
||||||
|
|
||||||
|
# GET /admin/tools/wordcloud/status
|
||||||
|
# Returns the current admin user's generation progress as JSON.
#
# Responds with the cached progress hash, or a "not started" placeholder
# when nothing is cached under the user's progress key.
def status
  cached = Rails.cache.fetch("wordcloud_progress:#{current_admin_user.id}")
  fallback = { status: 'not_started', progress: 0 }

  render json: (cached || fallback)
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
# Creates WORDCLOUD_DIR (including missing parents) when it does not exist yet.
def ensure_wordcloud_dir
  return if Dir.exist?(WORDCLOUD_DIR)

  FileUtils.mkdir_p(WORDCLOUD_DIR)
end
|
||||||
|
|
||||||
|
# Copies the uploaded domains file to a location that outlives the request.
#
# The request's tempfile may disappear once the request ends, while the
# background job reads the file later, so a copy is placed under tmp/.
#
# @param uploaded_file the uploaded file object from params (must respond to
#   #size and #tempfile)
# @return [Pathname, nil] path of the persistent copy, or nil when the upload
#   is empty (flash[:alert] is set in that case)
def process_uploaded_file(uploaded_file)
  # Reject empty uploads before touching the filesystem.
  if uploaded_file.size.zero?
    # NOTE(review): no matching key is visible in tools.en.yml — confirm it is defined elsewhere.
    flash[:alert] = I18n.t('admin.tools.wordcloud_empty_file')
    return nil
  end

  # A timestamp alone collides when two uploads arrive within the same
  # second; the random suffix keeps each copy distinct.
  # SecureRandom is loaded by ActiveSupport in a Rails application.
  file_name = "domains_#{Time.now.to_i}_#{SecureRandom.hex(4)}.csv"
  persistent_file_path = Rails.root.join('tmp', file_name)

  FileUtils.cp(uploaded_file.tempfile.path, persistent_file_path)
  persistent_file_path
end
|
||||||
|
|
||||||
|
# Builds the generator configuration hash from the submitted form fields.
#
# Numeric settings are parsed as base-10 integers and clamped to the same
# limits the form declares, so the saved JSON contains real numbers and
# hand-crafted requests cannot exceed the intended ranges. Blank or
# unparsable values fall back to the defaults.
#
# @return [Hash{Symbol => Object}] always contains :width, :height,
#   :max_words, :background_color, :min_word_length, :include_numbers,
#   :batch_size and :additional_prompt; :additional_stopwords and
#   :special_terms are present only when at least one term was submitted
def build_config_from_params
  config = {
    width: bounded_integer_param(:width, 800, 400, 2000),
    height: bounded_integer_param(:height, 800, 400, 2000),
    max_words: bounded_integer_param(:max_words, 500, 100, 1000),
    background_color: non_blank_param(:background_color) || 'white',
    min_word_length: bounded_integer_param(:min_word_length, 2, 1, 5),
    include_numbers: params[:include_numbers] == '1',
    batch_size: bounded_integer_param(:batch_size, 500, 100, 1000),
    additional_prompt: non_blank_param(:additional_prompt)
  }

  # Stopwords are matched case-insensitively, so they are stored lower-cased.
  stopwords = split_terms(params[:additional_stopwords].to_s.downcase)
  config[:additional_stopwords] = stopwords if stopwords.any?

  special_terms = split_terms(params[:special_terms].to_s)
  config[:special_terms] = special_terms if special_terms.any?

  config
end

# Reads params[key] as a base-10 integer, using +default+ when the value is
# missing or not a number, and clamps the result to min..max.
def bounded_integer_param(key, default, min, max)
  parsed = Integer(params[key].to_s, 10, exception: false)
  (parsed || default).clamp(min, max)
end

# Returns params[key] as a string, or nil when it is missing or whitespace-only.
def non_blank_param(key)
  value = params[key].to_s
  value.strip.empty? ? nil : value
end

# Splits a whitespace- and/or comma-separated list into its non-empty terms.
def split_terms(raw)
  raw.split(/[\s,]+/).reject(&:empty?)
end
|
||||||
|
|
||||||
|
# Loads the generator settings saved in WORDCLOUD_CONFIG_PATH.
#
# Falls back to default_wordcloud_config when the file is missing, cannot be
# read, is not valid JSON, or holds valid JSON that is not an object (the
# views index the result like a Hash).
#
# @return [Hash] the saved settings with string keys, or the defaults
def load_wordcloud_config
  return default_wordcloud_config unless File.exist?(WORDCLOUD_CONFIG_PATH)

  parsed = JSON.parse(File.read(WORDCLOUD_CONFIG_PATH))
  parsed.is_a?(Hash) ? parsed : default_wordcloud_config
rescue JSON::ParserError, SystemCallError
  # SystemCallError covers the file vanishing or being unreadable after the existence check.
  default_wordcloud_config
end
|
||||||
|
|
||||||
|
# Prepares the view data for an already generated wordcloud image.
#
# Sets @wordcloud_url (with the image's modification time as a cache-busting
# query parameter) and @wordcloud_generated_at (that modification time in the
# application time zone), then loads the top-words list.
def setup_wordcloud_data
  generated_at = File.mtime(WORDCLOUD_IMAGE_PATH)

  @wordcloud_url = "/wordcloud/wordcloud.png?t=#{generated_at.to_i}"
  @wordcloud_generated_at = generated_at.in_time_zone(Time.zone)

  load_top_words
end
|
||||||
|
|
||||||
|
# Loads the ranked word list from TOP_WORDS_PATH into @top_words.
#
# Each usable line has the form "<rank>. <word>: <count>"; other lines are
# ignored. The word may contain any non-whitespace characters, so terms such
# as "e-", "tr.ee" or words with non-ASCII letters are kept (Ruby's \w would
# only accept ASCII letters, digits and underscore).
#
# @top_words is an array of [word, count] pairs and is empty when the file
# does not exist.
def load_top_words
  @top_words = []
  return unless File.exist?(TOP_WORDS_PATH)

  File.foreach(TOP_WORDS_PATH, encoding: 'UTF-8') do |line|
    entry = /\A\d+\.\s+(?<word>\S+):\s+(?<count>\d+)\z/.match(line.strip)
    @top_words << [entry[:word], entry[:count].to_i] if entry
  end
end
|
||||||
|
|
||||||
|
# Settings used when no saved configuration is available.
#
# @return [Hash{String => Object}] a fresh hash with string keys, matching
#   the shape of a configuration parsed from JSON
def default_wordcloud_config
  preserved_terms = ['e-', 'i-', '2-', '3-', '4-', '.com', 'tr.ee', 'ai', 'web']

  {
    'width' => 800,
    'height' => 800,
    'max_words' => 500,
    'background_color' => 'white',
    'additional_stopwords' => [],
    'include_numbers' => true,
    'min_word_length' => 2,
    'special_terms' => preserved_terms,
    'batch_size' => 500
  }
end
|
||||||
|
|
||||||
|
# Before-action guard: requires the :access ability on :tools.
def authorize_admin
  authorize!(:access, :tools)
end
|
||||||
|
|
||||||
|
# Removes the current admin user's cached progress entry so a new run does
# not start out showing the previous run's state.
def clear_cache
  stale_key = "wordcloud_progress:#{current_admin_user.id}"
  Rails.cache.delete(stale_key)
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
14
app/controllers/admin/tools_controller.rb
Normal file
14
app/controllers/admin/tools_controller.rb
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
module Admin
  # Landing page listing the administrative tools; every action requires the
  # :access ability on :tools.
  class ToolsController < BaseController
    before_action :authorize_admin

    # GET /admin/tools
    def index
    end

    private

    # Before-action guard: requires the :access ability on :tools.
    def authorize_admin
      authorize!(:access, :tools)
    end
  end
end
|
119
app/jobs/generate_word_cloud_job.rb
Normal file
119
app/jobs/generate_word_cloud_job.rb
Normal file
|
@ -0,0 +1,119 @@
|
||||||
|
# Use Open3 to capture output in real-time
|
||||||
|
require 'open3'
|
||||||
|
|
||||||
|
# Background job that generates a wordcloud image from domain names
|
||||||
|
# using an external Python script with progress tracking
|
||||||
|
class GenerateWordCloudJob < ApplicationJob
|
||||||
|
# Runs one wordcloud generation.
#
# @param domains_file_path [String] path of the file with the domain names
# @param user_id the id used to build the per-user progress cache key
# @param config [Hash] generator settings, written to a JSON file for the script
#
# Errors raised while preparing or running the script are recorded through
# handle_error instead of propagating; cleanup runs in every case.
def perform(domains_file_path, user_id, config = {})
  Rails.logger.info("Generating wordcloud for #{domains_file_path}")

  @domains_file_path = domains_file_path
  @user_id = user_id
  @config = config
  @progress_key = "wordcloud_progress:#{user_id}"
  @wordcloud_dir = Rails.root.join('public', 'wordcloud')
  # Stays nil until setup_environment has written the file; cleanup checks for that.
  @config_file_path = nil

  initialize_progress

  begin
    setup_environment
    run_wordcloud_script
  rescue StandardError => e
    handle_error(e)
  ensure
    cleanup
  end
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
# Marks the run as started: stores status 'processing' with 0 progress
# under the progress cache key.
def initialize_progress
  starting_state = { status: 'processing', progress: 0 }
  Rails.cache.write(@progress_key, starting_state)
end
|
||||||
|
|
||||||
|
# Prepares everything the Python script needs before it is launched:
# the output directory, a JSON file holding @config (path kept in
# @config_file_path) and the script path (kept in @script_path), which is
# made executable when it is not already.
def setup_environment
  FileUtils.mkdir_p(@wordcloud_dir) unless Dir.exist?(@wordcloud_dir)

  config_file_name = "wordcloud_config_#{Time.now.to_i}.json"
  @config_file_path = Rails.root.join(@wordcloud_dir, config_file_name)
  File.write(@config_file_path, @config.to_json)

  @script_path = Rails.root.join('lib', 'wordcloud', 'generate_wordcloud.py')
  return if File.executable?(@script_path)

  FileUtils.chmod('+x', @script_path)
end
|
||||||
|
|
||||||
|
# Launches the Python script and hands its combined stdout/stderr stream to
# process_script_output.
#
# The interpreter comes from the PYTHON_EXECUTABLE environment variable
# (default 'python3'). The script receives the domains file, the output
# directory and the config file as positional arguments.
def run_wordcloud_script
  interpreter = ENV.fetch('PYTHON_EXECUTABLE', 'python3')
  script_env = { 'PYTHONIOENCODING' => 'utf-8', 'PYTHONUNBUFFERED' => '1' }
  command = [
    interpreter,
    @script_path.to_s,
    @domains_file_path,
    @wordcloud_dir.to_s,
    @config_file_path.to_s
  ]

  Open3.popen2e(script_env, *command) do |stdin, stdout_err, wait_thr|
    # The script gets no input; close stdin right away.
    stdin.close
    process_script_output(stdout_err, wait_thr)
  end
end
|
||||||
|
|
||||||
|
# Consumes the script's output line by line until the stream ends, updating
# the progress entry and logging each line, then records the final outcome
# from the process exit status.
#
# @param stdout_err [IO] combined stdout/stderr of the script
# @param wait_thr [Thread] wait thread whose value is the Process::Status
def process_script_output(stdout_err, wait_thr)
  stdout_err.each_line do |output_line|
    update_progress_from_output(output_line)
    Rails.logger.info("WordCloud: #{output_line.strip}")
  end

  handle_exit_status(wait_thr.value)
end
|
||||||
|
|
||||||
|
# Translates one line of script output into a progress update.
#
# - "Processing batch <current>/<total>" maps to 0..80 in proportion to the
#   batches done (capped at 80; a total of 0 is ignored because the ratio
#   would be undefined and rounding it would raise).
# - "Total estimated cost" marks 80.
# - "Generating word cloud" marks 90.
# Any other line leaves the progress untouched.
#
# @param line [String] a single line printed by the script
def update_progress_from_output(line)
  case line
  when %r{Processing batch (?<current>\d+)/(?<total>\d+)}
    batch = Regexp.last_match
    total = batch[:total].to_i
    return if total.zero?

    completed = [batch[:current].to_f / total, 1.0].min
    update_progress((completed * 80).round)
  when /Total estimated cost/
    update_progress(80)
  when /Generating word cloud/
    update_progress(90)
  end
end
|
||||||
|
|
||||||
|
# Stores the given progress value and status under the progress cache key.
#
# @param value [Integer] progress value to store
# @param status [String] run state stored alongside it
def update_progress(value, status: 'processing')
  snapshot = { status: status, progress: value }
  Rails.cache.write(@progress_key, snapshot)
end
|
||||||
|
|
||||||
|
# Records the final outcome of the script run.
#
# A successful exit stores 100 / 'completed'. Anything else stores a
# 'failed' entry whose error names the exit code, or the terminating signal
# when the process did not exit normally (exitstatus is nil in that case).
#
# @param exit_status [Process::Status] status of the finished script process
def handle_exit_status(exit_status)
  return update_progress(100, status: 'completed') if exit_status.success?

  reason =
    if exit_status.exitstatus
      "Process failed with status #{exit_status.exitstatus}"
    else
      "Process terminated by signal #{exit_status.termsig}"
    end

  Rails.cache.write(@progress_key, { status: 'failed', progress: 0, error: reason })
end
|
||||||
|
|
||||||
|
# Logs the exception (message and backtrace) and stores a 'failed' progress
# entry carrying the exception message.
#
# @param exception [StandardError] the error raised during the run
def handle_error(exception)
  Rails.logger.error("Error in WordCloud job: #{exception.message}")
  Rails.logger.error(exception.backtrace.join("\n"))

  failure = { status: 'failed', progress: 0, error: exception.message }
  Rails.cache.write(@progress_key, failure)
end
|
||||||
|
|
||||||
|
# Removes the temporary files belonging to this run.
#
# Deletes the generated config file and the uploaded domains copy. The
# domains file is only removed when it sits directly in <Rails.root>/tmp and
# is named "domains_*", i.e. it matches the copy the wordcloud controller
# creates; any other path handed to the job is left alone.
# FileUtils.rm_f is used so a file that is already gone cannot raise from
# the ensure block.
def cleanup
  FileUtils.rm_f(@config_file_path) if @config_file_path

  uploaded_copy = @domains_file_path.to_s
  in_tmp_dir = File.dirname(uploaded_copy) == Rails.root.join('tmp').to_s
  return unless in_tmp_dir && File.basename(uploaded_copy).start_with?('domains_')

  FileUtils.rm_f(uploaded_copy)
end
|
||||||
|
end
|
|
@ -121,6 +121,7 @@ class Ability
|
||||||
can :destroy, :pending
|
can :destroy, :pending
|
||||||
can :create, :zonefile
|
can :create, :zonefile
|
||||||
can :access, :settings_menu
|
can :access, :settings_menu
|
||||||
|
can :access, :tools
|
||||||
can :manage, :mass_actions
|
can :manage, :mass_actions
|
||||||
can :manage, BouncedMailAddress
|
can :manage, BouncedMailAddress
|
||||||
end
|
end
|
||||||
|
|
|
@ -6,6 +6,8 @@
|
||||||
%li= link_to t(:contacts), admin_contacts_path
|
%li= link_to t(:contacts), admin_contacts_path
|
||||||
- if can? :show, Registrar
|
- if can? :show, Registrar
|
||||||
%li= link_to t(:registrars), admin_registrars_path
|
%li= link_to t(:registrars), admin_registrars_path
|
||||||
|
- if can?(:access, :tools)
|
||||||
|
%li= link_to t(:tools), admin_tools_path
|
||||||
- if can?(:access, :settings_menu)
|
- if can?(:access, :settings_menu)
|
||||||
%li.dropdown
|
%li.dropdown
|
||||||
%a.dropdown-toggle{"data-toggle" => "dropdown", href: "#"}
|
%a.dropdown-toggle{"data-toggle" => "dropdown", href: "#"}
|
||||||
|
|
23
app/views/admin/tools/index.html.erb
Normal file
23
app/views/admin/tools/index.html.erb
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
<%= render "shared/title", name: t('admin.tools.title') %>
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-12">
|
||||||
|
<div class="panel panel-default">
|
||||||
|
<div class="panel-heading">
|
||||||
|
<h3 class="panel-title"><%= t('admin.tools.available_tools') %></h3>
|
||||||
|
</div>
|
||||||
|
<div class="panel-body">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-4">
|
||||||
|
<div class="well well-sm">
|
||||||
|
<h4><%= t('admin.tools.wordcloud_generator') %></h4>
|
||||||
|
<p><%= t('admin.tools.wordcloud_generator_description') %></p>
|
||||||
|
<%= link_to t('admin.tools.generate_wordcloud'), admin_tools_wordcloud_path, class: 'btn btn-primary' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<!-- Additional tools can be added here in similar well blocks -->
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
114
app/views/admin/tools/wordcloud/_form.html.erb
Normal file
114
app/views/admin/tools/wordcloud/_form.html.erb
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
<%= form_tag admin_tools_wordcloud_path, method: :post, multipart: true do %>
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-12">
|
||||||
|
<div class="form-group">
|
||||||
|
<div class="custom-file-upload">
|
||||||
|
<p class="text-muted"><%= t('admin.tools.wordcloud.custom_file_description') %></p>
|
||||||
|
<%= file_field_tag :domains_file, accept: '.csv', class: 'form-control' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="panel panel-default mt-3">
|
||||||
|
<div class="panel-heading">
|
||||||
|
<h4 class="panel-title">
|
||||||
|
<a data-toggle="collapse" href="#advancedOptions">
|
||||||
|
<i class="fa fa-cog"></i> <%= t('admin.tools.wordcloud.advanced_options') %>
|
||||||
|
</a>
|
||||||
|
</h4>
|
||||||
|
</div>
|
||||||
|
<div id="advancedOptions" class="panel-collapse collapse">
|
||||||
|
<div class="panel-body">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-4">
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :width, t('admin.tools.wordcloud.width') %>
|
||||||
|
<%= number_field_tag :width, @config['width'], min: 400, max: 2000, step: 100, class: 'form-control' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col-md-4">
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :height, t('admin.tools.wordcloud.height') %>
|
||||||
|
<%= number_field_tag :height, @config['height'], min: 400, max: 2000, step: 100, class: 'form-control' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col-md-4">
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :max_words, t('admin.tools.wordcloud.max_words') %>
|
||||||
|
<%= number_field_tag :max_words, @config['max_words'], min: 100, max: 1000, step: 50, class: 'form-control' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-6">
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :batch_size, t('admin.tools.wordcloud.batch_size') %>
|
||||||
|
<%= number_field_tag :batch_size, @config['batch_size'], min: 100, max: 1000, step: 50, class: 'form-control' %>
|
||||||
|
<small class="text-muted"><%= t('admin.tools.wordcloud.batch_size_help') %></small>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col-md-6">
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :background_color, t('admin.tools.wordcloud.background') %>
|
||||||
|
<%= select_tag :background_color,
|
||||||
|
options_for_select([
|
||||||
|
['White', 'white'],
|
||||||
|
['Black', 'black'],
|
||||||
|
['Transparent', 'transparent'],
|
||||||
|
['Light Gray', '#f0f0f0']
|
||||||
|
], @config['background_color']),
|
||||||
|
class: 'form-control' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-6">
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :min_word_length, t('admin.tools.wordcloud.min_word_length') %>
|
||||||
|
<%= number_field_tag :min_word_length, @config['min_word_length'], min: 1, max: 5, class: 'form-control' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col-md-6">
|
||||||
|
<div class="form-group">
|
||||||
|
<div class="checkbox" style="margin-top: 30px;">
|
||||||
|
<label>
|
||||||
|
<%= check_box_tag :include_numbers, '1', @config['include_numbers'] %>
|
||||||
|
<%= t('admin.tools.wordcloud.include_numbers') %>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :special_terms, t('admin.tools.wordcloud.special_terms') %>
|
||||||
|
<%= text_field_tag :special_terms, @config['special_terms'].is_a?(Array) ? @config['special_terms'].join(', ') : '',
|
||||||
|
class: 'form-control',
|
||||||
|
placeholder: t('admin.tools.wordcloud.special_terms_placeholder') %>
|
||||||
|
<small class="text-muted"><%= t('admin.tools.wordcloud.special_terms_help') %></small>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :additional_stopwords, t('admin.tools.wordcloud.additional_stopwords') %>
|
||||||
|
<%= text_area_tag :additional_stopwords, @config['additional_stopwords'].is_a?(Array) ? @config['additional_stopwords'].join(', ') : '',
|
||||||
|
rows: 3,
|
||||||
|
placeholder: t('admin.tools.wordcloud.stopwords_placeholder'),
|
||||||
|
class: 'form-control' %>
|
||||||
|
<small class="text-muted"><%= t('admin.tools.wordcloud.stopwords_help') %></small>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="form-group">
|
||||||
|
<%= label_tag :additional_prompt, t('admin.tools.wordcloud.additional_prompt') %>
|
||||||
|
<%= text_area_tag :additional_prompt, @config['additional_prompt'], class: "form-control", rows: 3,
|
||||||
|
placeholder: t('admin.tools.wordcloud.additional_prompt_placeholder') %>
|
||||||
|
<small class="form-text text-muted"><%= t('admin.tools.wordcloud.additional_prompt_help') %></small>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<%= submit_tag t('admin.tools.generate_wordcloud'), class: 'btn btn-primary btn-lg mt-3' %>
|
||||||
|
<% end %>
|
92
app/views/admin/tools/wordcloud/index.html.erb
Normal file
92
app/views/admin/tools/wordcloud/index.html.erb
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
<% content_for :actions do %>
|
||||||
|
<%= link_to t('back'), admin_tools_path, class: 'btn btn-default' %>
|
||||||
|
<% end %>
|
||||||
|
<%= render "shared/title", name: t('admin.tools.wordcloud.title') %>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.wordcloud-container {
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
.controls-section {
|
||||||
|
padding-top: 15px;
|
||||||
|
margin-top: 15px;
|
||||||
|
border-top: 1px solid #eee;
|
||||||
|
}
|
||||||
|
.instructions {
|
||||||
|
margin-bottom: 15px;
|
||||||
|
color: #555;
|
||||||
|
}
|
||||||
|
.mt-2 {
|
||||||
|
margin-top: 10px;
|
||||||
|
}
|
||||||
|
.wordcloud-container a {
|
||||||
|
display: block;
|
||||||
|
text-decoration: none;
|
||||||
|
padding: 5px;
|
||||||
|
border: 1px solid transparent;
|
||||||
|
transition: all 0.2s ease;
|
||||||
|
}
|
||||||
|
.wordcloud-container a:hover {
|
||||||
|
border-color: #ddd;
|
||||||
|
background-color: #f9f9f9;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
.wordcloud-container a small {
|
||||||
|
color: #337ab7;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-12">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-8">
|
||||||
|
<div class="panel panel-default">
|
||||||
|
<div class="panel-heading">
|
||||||
|
<h3 class="panel-title"><%= t('admin.tools.wordcloud.title') %></h3>
|
||||||
|
</div>
|
||||||
|
<div class="panel-body text-center">
|
||||||
|
<% if @wordcloud_url %>
|
||||||
|
<div class="wordcloud-container">
|
||||||
|
<%= link_to @wordcloud_url, target: "_blank", title: t('admin.tools.wordcloud.view_full_size') do %>
|
||||||
|
<%= image_tag @wordcloud_url, class: 'img-responsive', alt: t('admin.tools.wordcloud.title') %>
|
||||||
|
<div class="text-center mt-2">
|
||||||
|
<small><i class="fa fa-search-plus"></i> <%= t('admin.tools.wordcloud.click_to_enlarge') %></small>
|
||||||
|
</div>
|
||||||
|
<% end %>
|
||||||
|
|
||||||
|
<% if @wordcloud_generated_at %>
|
||||||
|
<div class="text-muted mt-2">
|
||||||
|
<small><i class="fa fa-clock-o"></i> <%= t('admin.tools.wordcloud.generated_at', time: l(@wordcloud_generated_at, format: :long)) %></small>
|
||||||
|
</div>
|
||||||
|
<% end %>
|
||||||
|
</div>
|
||||||
|
<% end %>
|
||||||
|
<div class="instructions">
|
||||||
|
<p><%= t('admin.tools.wordcloud.instructions') %></p>
|
||||||
|
</div>
|
||||||
|
<%= render 'admin/tools/wordcloud/form' %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="col-md-4">
|
||||||
|
<div class="panel panel-default">
|
||||||
|
<div class="panel-heading">
|
||||||
|
<h3 class="panel-title"><%= t('admin.tools.wordcloud.top_words') %></h3>
|
||||||
|
</div>
|
||||||
|
<div class="panel-body">
|
||||||
|
<% if @top_words && @top_words.any? %>
|
||||||
|
<ol>
|
||||||
|
<% @top_words.each do |word, count| %>
|
||||||
|
<li><strong><%= word %></strong>: <%= count %></li>
|
||||||
|
<% end %>
|
||||||
|
</ol>
|
||||||
|
<% else %>
|
||||||
|
<p class="text-muted"><%= t('admin.tools.wordcloud.top_words_empty') %></p>
|
||||||
|
<% end %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
93
app/views/admin/tools/wordcloud/progress.html.erb
Normal file
93
app/views/admin/tools/wordcloud/progress.html.erb
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
<div class="container-fluid">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-12">
|
||||||
|
<h1>WordCloud Generation Progress</h1>
|
||||||
|
|
||||||
|
<div class="card mb-4">
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="progress mb-3">
|
||||||
|
<div id="progress-bar" class="progress-bar" role="progressbar" style="width: <%= @progress_data[:progress] %>%;"
|
||||||
|
aria-valuenow="<%= @progress_data[:progress] %>" aria-valuemin="0" aria-valuemax="100">
|
||||||
|
<%= @progress_data[:progress] %>%
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="status-message">
|
||||||
|
<% case @progress_data[:status] %>
|
||||||
|
<% when 'not_started' %>
|
||||||
|
<div class="alert alert-info">Waiting to start processing...</div>
|
||||||
|
<% when 'processing' %>
|
||||||
|
<div class="alert alert-info">Processing in progress...</div>
|
||||||
|
<% when 'completed' %>
|
||||||
|
<div class="alert alert-success">
|
||||||
|
WordCloud generation completed!
|
||||||
|
<%= link_to "View WordCloud", admin_tools_wordcloud_path, class: "btn btn-primary" %>
|
||||||
|
</div>
|
||||||
|
<% when 'failed' %>
|
||||||
|
<div class="alert alert-danger">
|
||||||
|
Error: <%= @progress_data[:error] || "Unknown error occurred" %>
|
||||||
|
</div>
|
||||||
|
<% end %>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="mt-3">
|
||||||
|
<%= link_to "Back to Tools", admin_tools_path, class: "btn btn-secondary" %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {
|
||||||
|
// Only poll if the process is not completed or failed
|
||||||
|
if ("<%= @progress_data[:status] %>" !== "completed" && "<%= @progress_data[:status] %>" !== "failed") {
|
||||||
|
pollProgress();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Polls the status endpoint every 2 seconds and mirrors the result in the
// progress bar and the status message.
//
// Polling stops as soon as a terminal state ('completed' or 'failed') is
// seen, so only one redirect is ever scheduled and a failed run does not
// keep hitting the server. Messages are written with textContent, so an
// error string coming from the server is never interpreted as HTML.
function pollProgress() {
  const progressBar = document.getElementById('progress-bar');
  const statusMessage = document.getElementById('status-message');

  // Builds a Bootstrap alert box holding plain text.
  function buildAlert(kind, text) {
    const box = document.createElement('div');
    box.className = 'alert alert-' + kind;
    box.textContent = text;
    return box;
  }

  const timer = setInterval(function() {
    fetch('<%= status_admin_tools_wordcloud_path %>')
      .then(response => response.json())
      .then(data => {
        // Update progress bar
        progressBar.style.width = data.progress + '%';
        progressBar.setAttribute('aria-valuenow', data.progress);
        progressBar.textContent = data.progress + '%';

        // Update status message
        let alertBox = null;
        switch (data.status) {
          case 'not_started':
            alertBox = buildAlert('info', 'Waiting to start processing...');
            break;
          case 'processing':
            alertBox = buildAlert('info', 'Processing in progress...');
            break;
          case 'completed': {
            clearInterval(timer);
            alertBox = buildAlert('success', 'WordCloud generation completed! ');
            const link = document.createElement('a');
            link.href = '<%= admin_tools_wordcloud_path %>';
            link.textContent = 'View WordCloud';
            alertBox.appendChild(link);
            // Redirect after a short delay
            setTimeout(function() {
              window.location.href = '<%= admin_tools_wordcloud_path %>';
            }, 2000);
            break;
          }
          case 'failed':
            clearInterval(timer);
            alertBox = buildAlert('danger', 'Error: ' + (data.error || 'Unknown error occurred'));
            break;
        }

        // An unknown status clears the message, as before.
        statusMessage.innerHTML = '';
        if (alertBox) {
          statusMessage.appendChild(alertBox);
        }
      })
      .catch(error => {
        console.error('Error polling progress:', error);
      });
  }, 2000);
}
|
||||||
|
</script>
|
|
@ -54,6 +54,10 @@ Rails.application.configure do
|
||||||
|
|
||||||
# Use a different cache store in production.
|
# Use a different cache store in production.
|
||||||
# config.cache_store = :mem_cache_store
|
# config.cache_store = :mem_cache_store
|
||||||
|
config.cache_store = :redis_cache_store, {
|
||||||
|
url: "#{ENV.fetch('REDIS_URL', 'redis://localhost:6379')}/1",
|
||||||
|
expires_in: 300.seconds
|
||||||
|
}
|
||||||
|
|
||||||
# Use a real queuing backend for Active Job (and separate queues per environment)
|
# Use a real queuing backend for Active Job (and separate queues per environment)
|
||||||
config.active_job.queue_adapter = :sidekiq
|
config.active_job.queue_adapter = :sidekiq
|
||||||
|
|
|
@ -2,7 +2,7 @@ require 'sidekiq/web' # Require at the top of the initializer
|
||||||
|
|
||||||
Sidekiq.configure_server do |config|
|
Sidekiq.configure_server do |config|
|
||||||
config.logger.level = Logger::INFO
|
config.logger.level = Logger::INFO
|
||||||
|
|
||||||
# Custom job logging format
|
# Custom job logging format
|
||||||
Sidekiq.logger.formatter = proc do |severity, datetime, progname, msg|
|
Sidekiq.logger.formatter = proc do |severity, datetime, progname, msg|
|
||||||
thread_id = Thread.current.object_id.to_s(36)
|
thread_id = Thread.current.object_id.to_s(36)
|
||||||
|
|
42
config/locales/admin/tools.en.yml
Normal file
42
config/locales/admin/tools.en.yml
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
en:
|
||||||
|
admin:
|
||||||
|
tools:
|
||||||
|
title: "Administrative Tools"
|
||||||
|
available_tools: "Available Tools"
|
||||||
|
wordcloud_generator: "Domain Name Wordcloud Generator"
|
||||||
|
wordcloud_generator_description: "Generate a visual wordcloud from domain names in the registry"
|
||||||
|
generate_wordcloud: "Generate Wordcloud"
|
||||||
|
wordcloud:
|
||||||
|
title: "Domain Name Wordcloud"
|
||||||
|
success: "Wordcloud generated successfully"
|
||||||
|
error: "Error generating wordcloud"
|
||||||
|
processing: "Processing domain names. This may take a few minutes..."
|
||||||
|
instructions: "Generate a visual representation of the most common words in domain names. Click the button below to create the wordcloud."
|
||||||
|
top_words: "Top Words"
|
||||||
|
top_words_empty: "Generate a wordcloud to see the most frequent words."
|
||||||
|
click_to_enlarge: "Click to enlarge"
|
||||||
|
view_full_size: "View full size wordcloud image"
|
||||||
|
use_custom_domains: "Use custom domain list"
|
||||||
|
custom_file_description: "Upload a CSV file with one domain name per line"
|
||||||
|
file_upload_error: "Error processing uploaded file"
|
||||||
|
file_optional: "If no file is uploaded, all active domains in the registry will be used"
|
||||||
|
generated_at: "Generated at %{time}"
|
||||||
|
no_file: "No domain names found"
|
||||||
|
width: "Width"
|
||||||
|
height: "Height"
|
||||||
|
max_words: "Max Words"
|
||||||
|
background: "Background"
|
||||||
|
additional_stopwords: "Stopwords"
|
||||||
|
stopwords_placeholder: "Enter additional stopwords, one per line"
|
||||||
|
stopwords_help: "Stopwords are words that will not be included in the wordcloud"
|
||||||
|
advanced_options: "Advanced Options"
|
||||||
|
min_word_length: "Min Word Length"
|
||||||
|
include_numbers: "Include Numbers"
|
||||||
|
special_terms: "Special Terms"
|
||||||
|
special_terms_placeholder: "e.g., e-, i-, .com, ai, web"
|
||||||
|
special_terms_help: "These terms will be preserved in the word cloud even if they would normally be filtered out"
|
||||||
|
batch_size: "Batch Size"
|
||||||
|
batch_size_help: "Number of domains to process in each API call."
|
||||||
|
additional_prompt: "Additional Prompt Text"
|
||||||
|
additional_prompt_placeholder: "Add any additional instructions for the word cloud generation here..."
|
||||||
|
additional_prompt_help: "Optional text that will be used as additional context during word cloud generation."
|
|
@ -229,6 +229,7 @@ en:
|
||||||
valid_from: 'Valid from'
|
valid_from: 'Valid from'
|
||||||
general: 'General'
|
general: 'General'
|
||||||
contacts: 'Contacts'
|
contacts: 'Contacts'
|
||||||
|
tools: 'Tools'
|
||||||
identity_code: 'Identity code'
|
identity_code: 'Identity code'
|
||||||
nameservers: 'Nameservers'
|
nameservers: 'Nameservers'
|
||||||
hostname: 'Hostname'
|
hostname: 'Hostname'
|
||||||
|
|
|
@ -250,6 +250,17 @@ Rails.application.routes.draw do
|
||||||
end
|
end
|
||||||
# post 'admi/upload_spreadsheet', to: 'customers#upload_spreadsheet', as: :customers_upload_spreadsheet
|
# post 'admi/upload_spreadsheet', to: 'customers#upload_spreadsheet', as: :customers_upload_spreadsheet
|
||||||
|
|
||||||
|
resources :tools, only: %i[index]
|
||||||
|
|
||||||
|
namespace :tools do
|
||||||
|
resource :wordcloud, controller: 'wordcloud', only: %i[create] do
|
||||||
|
collection do
|
||||||
|
get '', to: 'wordcloud#index'
|
||||||
|
get 'progress', to: 'wordcloud#progress', as: :progress
|
||||||
|
get 'status', to: 'wordcloud#status', as: :status
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
resources :bank_statements do
|
resources :bank_statements do
|
||||||
resources :bank_transactions
|
resources :bank_transactions
|
||||||
|
|
BIN
lib/wordcloud/fonts/Pacifico-Regular.ttf
Normal file
BIN
lib/wordcloud/fonts/Pacifico-Regular.ttf
Normal file
Binary file not shown.
351
lib/wordcloud/generate_wordcloud.py
Normal file
351
lib/wordcloud/generate_wordcloud.py
Normal file
|
@ -0,0 +1,351 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import asyncio
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
from os import path
|
||||||
|
from wordcloud import WordCloud, STOPWORDS
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
# import pandas as pd
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
# Pull variables from a local .env file (if any) into the process environment
# before any of the OPENAI_* settings below are read.
load_dotenv()

# Number of domain names sent to the OpenAI API per request; overridable via
# the OPENAI_BATCH_SIZE environment variable.
BATCH_SIZE = int(os.getenv("OPENAI_BATCH_SIZE", "20"))
|
||||||
|
|
||||||
|
def load_system_prompt():
    """Load the system prompt text from ``system_prompt.md``.

    The file is looked up next to this script. When ``__file__`` is not
    defined (e.g. the code is executed via ``exec`` or an interactive
    session) the current working directory is used instead, mirroring the
    fallback the module uses for its own base directory.

    Returns:
        str: The full contents of ``system_prompt.md``.

    Raises:
        FileNotFoundError: If ``system_prompt.md`` does not exist.
    """
    # Same fallback as the module-level base directory: never assume __file__ exists.
    base_dir = path.dirname(__file__) if "__file__" in globals() else os.getcwd()
    prompt_file = path.join(base_dir, 'system_prompt.md')

    if not path.exists(prompt_file):
        raise FileNotFoundError(f"System prompt not found at {prompt_file}. Please create the file.")

    with open(prompt_file, 'r', encoding='utf-8') as f:
        return f.read()
|
||||||
|
|
||||||
|
# Base directory of this script (mask and font are resolved against it);
# falls back to the working directory when __file__ is not defined.
d = os.getcwd() if "__file__" not in locals() else path.dirname(__file__)

# Optional 2nd CLI argument: directory that receives the generated files.
output_dir = d if len(sys.argv) <= 2 else sys.argv[2]

# The system prompt is mandatory; abort with exit code 1 when it is missing.
try:
    SYSTEM_PROMPT = load_system_prompt()
except FileNotFoundError as e:
    print(f"Error: {e}")
    sys.exit(1)
else:
    print("System prompt successfully loaded from file.")

# Optional 3rd CLI argument: path to a JSON file with rendering options.
# A missing or non-existent file silently leaves the configuration empty.
config = {}
if len(sys.argv) > 3 and sys.argv[3]:
    config_file = sys.argv[3]
    if path.exists(config_file):
        with open(config_file, 'r') as f:
            config = json.load(f)
        print(f"Loaded configuration: {config}")

# Mandatory 1st CLI argument: path to the file listing the domain names.
if len(sys.argv) <= 1 or not sys.argv[1]:
    print("Error: Domains file not found")
    sys.exit(1)

domains_file = sys.argv[1]
if not path.exists(domains_file):
    print(f"Error: Provided domains file {domains_file} not found")
    sys.exit(1)

# One domain per line; blank lines are dropped and names are lower-cased.
domain_names = []
with open(domains_file, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.strip()
        if stripped:
            domain_names.append(stripped.lower())

if len(domain_names) == 0:
    print("Error: No domain names found in the provided file")
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Function to extract words using OpenAI API asynchronously
|
||||||
|
async def extract_words_with_openai(domain_names, batch_size=BATCH_SIZE):
    """Split domain names into words by querying the OpenAI API in concurrent batches.

    Domains whose main label (after an optional leading ``www.``) consists
    only of digits are skipped. The remaining domains are sent in batches of
    ``batch_size``; at most 10 requests are in flight at any time.

    Args:
        domain_names: Sequence of domain name strings.
        batch_size: Number of domains per API call; must be at least 1.

    Returns:
        list: All words reported for domains whose ``Language`` is not
        ``"Ignore"``, concatenated in batch order. A batch whose request or
        response parsing fails contributes no words (the error is printed).

    Raises:
        ValueError: If ``batch_size`` is less than 1 or the ``OPENAI_API_KEY``
            environment variable is not set.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    def is_numeric_only(domain):
        """Return True when the main label of the domain is purely digits."""
        core = domain.lower()
        # Strip "www." only as a leading label; removing it anywhere in the
        # string could glue unrelated parts of the name together.
        if core.startswith('www.'):
            core = core[len('www.'):]
        return core.split('.')[0].isdigit()

    # Filter out domains that are only numbers
    filtered_domains = [domain for domain in domain_names if not is_numeric_only(domain)]

    # Get API key from environment variable
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")

    # Initialize AsyncOpenAI client
    client = AsyncOpenAI(api_key=api_key)

    # Get model and sampling settings from environment variables
    model = os.environ.get("OPENAI_MODEL", "gpt-4o-2024-11-20")
    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0"))
    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "16000"))

    # Structured-output schema; identical for every batch, so build it once.
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "domain_analysis_results",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "results": {
                        "type": "array",
                        "description": "A list of analysis results for the provided domains.",
                        "items": {
                            "type": "object",
                            "properties": {
                                "Language": {
                                    "type": "string",
                                    "description": "The language identified in the domain name."
                                },
                                "is_splitted": {
                                    "type": "string",
                                    "description": "Indicates whether the domain name is split into recognizable words."
                                },
                                "reasoning": {
                                    "type": "string",
                                    "description": "Explanation of the reasoning behind the language and word identification."
                                },
                                "words": {
                                    "type": "array",
                                    "description": "The words identified in the domain name.",
                                    "items": {
                                        "type": "string"
                                    }
                                }
                            },
                            "required": [
                                "Language",
                                "is_splitted",
                                "reasoning",
                                "words"
                            ],
                            "additionalProperties": False
                        }
                    }
                },
                "required": [
                    "results"
                ],
                "additionalProperties": False
            }
        }
    }

    # Calculate number of batches (ceiling division)
    num_batches = (len(filtered_domains) + batch_size - 1) // batch_size

    # Create semaphore to limit concurrent requests
    semaphore = asyncio.Semaphore(10)  # Limit to 10 concurrent requests

    async def process_batch(batch_idx):
        """Process one batch; return (words, prompt_tokens, completion_tokens, cost)."""
        async with semaphore:
            start_idx = batch_idx * batch_size
            batch = filtered_domains[start_idx:start_idx + batch_size]

            print(f"Processing batch {batch_idx + 1}/{num_batches} ({len(batch)} domains)...")
            sys.stdout.flush()

            # Prepare the prompt with the domain names of this batch
            domains_text = "\n".join(batch)
            prompt = f"List of domain names: {domains_text}"

            # Usage is tracked outside the try body so tokens already spent
            # are still reported when parsing the response fails afterwards.
            prompt_tokens = 0
            completion_tokens = 0
            batch_cost = 0

            try:
                print(f"Using model: {model} with temperature: {temperature}")
                response = await client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
                    response_format=response_format,
                    temperature=temperature,
                    max_tokens=max_tokens,
                )

                # Track token usage (the usage object may be absent)
                usage = response.usage
                if usage is not None:
                    prompt_tokens = usage.prompt_tokens
                    completion_tokens = usage.completion_tokens
                    print(f"Token usage - Prompt: {prompt_tokens}, Completion: {completion_tokens}, Total: {usage.total_tokens}")

                # Calculate cost (approximate; only GPT-4.1 pricing is known here)
                if "gpt-4.1" in model:
                    prompt_cost = (prompt_tokens / 1000000) * 2.00  # $2.00 per 1M input tokens
                    completion_cost = (completion_tokens / 1000000) * 8.00  # $8.00 per 1M output tokens
                    batch_cost = prompt_cost + completion_cost
                print(f"Estimated batch cost: ${batch_cost:.6f}")

                # Extract the words from the response
                response_json = json.loads(response.choices[0].message.content)
                batch_words = []
                for result in response_json['results']:
                    if result['Language'] == 'Ignore':
                        continue
                    batch_words.extend(result['words'])

                print(f"Extracted {len(batch_words)} words from this batch")
                return batch_words, prompt_tokens, completion_tokens, batch_cost

            except Exception as e:
                # Best effort: a failed batch is reported and skipped so the
                # remaining batches can still produce a word cloud.
                print(f"Error calling OpenAI API for batch: {e}")
                return [], prompt_tokens, completion_tokens, batch_cost

    # Run all batches concurrently and wait for results; always release the
    # HTTP client's connections, even if gathering is interrupted.
    try:
        batch_results = await asyncio.gather(
            *(process_batch(batch_idx) for batch_idx in range(num_batches))
        )
    finally:
        await client.close()

    # Combine words and usage figures from all batches
    all_words = []
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_cost = 0
    for batch_words, prompt_tokens, completion_tokens, batch_cost in batch_results:
        all_words.extend(batch_words)
        total_prompt_tokens += prompt_tokens
        total_completion_tokens += completion_tokens
        total_cost += batch_cost

    print(f"Total token usage - Prompt: {total_prompt_tokens}, Completion: {total_completion_tokens}")
    print(f"Total estimated cost: ${total_cost:.6f}")

    return all_words
|
||||||
|
|
||||||
|
# Replace the synchronous call with an async function
|
||||||
|
async def main():
    """Generate the word cloud image and the top-words report.

    Extracts words from the module-level ``domain_names`` via
    ``extract_words_with_openai``, renders them with the mask and font found
    under the script directory ``d``, and writes ``wordcloud.png`` and
    ``top_words.txt`` into ``output_dir``. Rendering options are read from the
    module-level ``config`` dict with defaults for missing keys.

    Exits the process with status 1 when no words could be extracted.
    """
    # Process domain names using OpenAI
    print("Extracting words from domain names using OpenAI...")
    extracted_words = await extract_words_with_openai(domain_names)
    print(f"Extracted {len(extracted_words)} words")

    if not extracted_words:
        # Without any words the renderer cannot build a cloud; fail with a
        # clear message instead of an opaque error from the library.
        print("Error: No words could be extracted from the provided domain names")
        sys.exit(1)

    # Join the extracted words for the word cloud
    processed_text = ' '.join(extracted_words)

    def custom_color_func(word, font_size, position, orientation, random_state=None,
                          **kwargs):
        # Blue hue with varying lightness. Draw from the supplied random state
        # when there is one so a fixed seed gives reproducible colors.
        rng = random_state if random_state is not None else random
        return "hsl(215, 100%%, %d%%)" % rng.randint(15, 80)

    # Load the shape mask and release the file handle right away
    with Image.open(path.join(d, 'mask.png')) as mask_image:
        mask = np.array(mask_image)

    # Get configuration values with defaults
    width = int(config.get('width', 800))
    height = int(config.get('height', 800))
    max_words = int(config.get('max_words', 500))
    background_color = config.get('background_color', 'white')
    min_word_length = int(config.get('min_word_length', 2))
    include_numbers = config.get('include_numbers', True)

    # Handle transparent background
    if background_color == 'transparent':
        background_color = None

    # Get additional stopwords; tolerate a null value or a newline-separated string
    additional_stopwords = config.get('additional_stopwords') or []
    if isinstance(additional_stopwords, str):
        additional_stopwords = additional_stopwords.split()

    # Estonian and English function words that should never appear in the cloud
    custom_stopwords = {
        'ja', 'ning', 'et', 'kui', 'aga', 'ka', 'ei', 'see', 'on', 'ole',
        'oma', 'seda', 'siis', 'või', 'mis', 'nii', 'veel', 'kes', 'üle',
        'välja', 'olema', 'kus', 'nagu', 'kuid', 'selle', 'pole', 'ära',
        'vaid', 'sest', 'juba', 'meie', 'mida', 'need', 'olid', 'minu',
        'tema', 'pärast', 'mingi', 'palju', 'kõik', 'seal', 'olen', 'oled',
        'oli', 'olnud', 'ongi', 'poolt', 'meil', 'teda', 'just', 'kuna',
        'läbi', 'küll',
        'the', 'and', 'a', 'to', 'of', 'in', 'is', 'that', 'it', 'for',
        'with', 'as', 'be', 'on', 'not', 'this', 'but', 'by', 'from', 'are',
        'or', 'an', 'at', 'was', 'have', 'has', 'had', 'were', 'will', 'would',
        'should', 'can', 'could', 'may', 'might', 'must', 'do', 'does', 'did',
        'doing', 'done', 'their', 'they', 'them', 'there', 'these', 'those',
        'which', 'who', 'whom', 'whose', 'what', 'when', 'where', 'why', 'how'
    }

    # Start from the library's built-in list and add ours on top
    stopwords = set(STOPWORDS)
    stopwords.update(custom_stopwords)
    stopwords.update(word.lower() for word in additional_stopwords)

    font_path = path.join(d, 'fonts', 'Pacifico-Regular.ttf')
    # Alternative: use a system font
    # font_path = fm.findfont(fm.FontProperties(family='Arial'))

    print("Generating word cloud...")
    wc = WordCloud(width=width, height=height,
                   mask=mask,
                   stopwords=stopwords,
                   background_color=background_color,
                   max_words=max_words,
                   include_numbers=include_numbers,
                   collocations=False,
                   min_word_length=min_word_length,
                   regexp=r"[A-Za-zÕÄÖÜõäöü0-9][\w\-'ÕÄÖÜõäöü]*(?<!\.ee)(?<!ee)",
                   font_path=font_path)

    wc.generate(processed_text)

    # Get word frequencies from the word cloud
    word_frequencies = wc.process_text(processed_text)
    # Remove stopwords from the frequencies
    word_frequencies = {word: freq for word, freq in word_frequencies.items()
                        if word.lower() not in stopwords}

    # Sort words by frequency (highest first)
    sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)

    # Get top 10 words
    top_10_words = sorted_words[:10]

    # Print top 10 words to console
    print("\nTop 10 most frequent words:")
    for word, freq in top_10_words:
        print(f"{word}: {freq}")

    # Make sure the destination exists before writing any output
    os.makedirs(output_dir, exist_ok=True)

    # Save top 10 words to a text file
    top_words_file = path.join(output_dir, 'top_words.txt')
    with open(top_words_file, 'w', encoding='utf-8') as f:
        f.write("Top 10 most frequent words:\n")
        for i, (word, freq) in enumerate(top_10_words, 1):
            f.write(f"{i}. {word}: {freq}\n")

    print(f"\nTop words saved to {top_words_file}")

    # Apply the custom palette in place. No interactive window is opened:
    # this script runs unattended and only the saved file matters.
    wc.recolor(color_func=custom_color_func, random_state=3)

    # Save the word cloud to file
    wc.to_file(path.join(output_dir, 'wordcloud.png'))
|
||||||
|
|
||||||
|
# Call the async main function
|
||||||
|
# Script entry point: only when executed directly (not imported), start an
# event loop and drive the async pipeline to completion.
if __name__ == "__main__":
    asyncio.run(main())
|
BIN
lib/wordcloud/mask.png
Normal file
BIN
lib/wordcloud/mask.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 52 KiB |
89
lib/wordcloud/system_prompt.md
Normal file
89
lib/wordcloud/system_prompt.md
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
You are a bilingual Estonian-English linguist and word-segmentation expert.
|
||||||
|
Your task is to identify which word or words a domain name consists of. You only work with English and Estonian words.
|
||||||
|
|
||||||
|
### INSTRUCTION
|
||||||
|
**Key “Language”**
|
||||||
|
You must determine the language of the domain name. The domain name can be a single word or several words. You have 3 options: Estonian, English, Ignore.
|
||||||
|
- Ignore the protocol, the leading “www.” sub-domain (if present) and the top-level domain (e.g. “.ee”, “.com”) – they never influence language detection.
|
||||||
|
- If the domain consists of numbers, random letters, abbreviations, personal names, or is a transliteration from another language (for example, mnogoknig.ee from Russian), you should choose “Ignore” for Language.
|
||||||
|
- Otherwise, use a longest-match left-to-right lookup against (1) an Estonian core-vocabulary list, (2) a general English dictionary, (3) a whitelist of well-known abbreviations such as BMW, CAD, NGO, AI, EE. Whichever language supplies the majority of matched tokens becomes the value of Language.
|
||||||
|
- When tokens from both languages are present in roughly equal measure, choose the language that appears first in the domain string.
|
||||||
|
|
||||||
|
**Key “is_splitted”**
|
||||||
|
Here you must specify whether the domain name consists of more than one word.
|
||||||
|
- Treat a digit boundary (letter → digit or digit → letter) as an automatic split; the digit itself counts as a separate token.
|
||||||
|
- Treat a change of language (Estonian token followed by English token, or vice versa) as a split.
|
||||||
|
- Hyphens “-” or underscores “_” (even though rare in .ee domains) are explicit boundaries.
|
||||||
|
- Even if the domain includes an Estonian word plus an abbreviation, acronym or number, you still set “is_splitted” to true.
|
||||||
|
|
||||||
|
**Key “reasoning”**
|
||||||
|
Here, you should reason about which exact words and abbreviations make up the domain name.
|
||||||
|
- Work left → right, applying longest-match dictionary look-ups; if no match is possible and the fragment is ≤ 3 letters, treat it as an abbreviation; if it is longer, treat it as nonsense and set Language = Ignore.
|
||||||
|
- When you recognise an Estonian morphological ending (-id, -ed, -us, -ja, -jad, -te), peel it off and explain the root plus ending in the reasoning.
|
||||||
|
- If Language is Ignore, simply write “Ignore”. Otherwise, for every recognised word, abbreviation, symbol or number give a short definition or plausible meaning.
|
||||||
|
|
||||||
|
**Key “words”**
|
||||||
|
Based on the reasoning above, list only the words and tokens that make up the domain, in the order they appear.
|
||||||
|
- Omit “www”, TLDs and any punctuation.
|
||||||
|
- Keep digits as separate tokens (e.g. auto24.ee → “auto”, “24”).
|
||||||
|
- For fragments treated as abbreviations include the abbreviation exactly as it appears (“BMW”, “CAD”).
|
||||||
|
- If Language = Ignore, leave the array empty.
|
||||||
|
|
||||||
|
### EXAMPLES OF SPLITTING WORDS:
|
||||||
|
advanceautokool.ee: advance, auto, kool
|
||||||
|
1autosuvila.ee: auto, suvila
|
||||||
|
autoaks.ee: auto
|
||||||
|
autoeis.ee: auto
|
||||||
|
autoklaasitehnik.ee: auto, klaas, tehnik
|
||||||
|
autokoolmegalinn.ee: auto, kool, mega, linn
|
||||||
|
autoly.ee: auto
|
||||||
|
automatiseeri.ee: auto
|
||||||
|
autonova.ee: auto, nova
|
||||||
|
autor.ee: autor
|
||||||
|
autost24.ee: auto, 24
|
||||||
|
eestiaiandus.ee: eesti, aiandus
|
||||||
|
eestiastelpaju.ee: eesti, astelpaju
|
||||||
|
eestiloomekoda.ee: eesti, loomekoda
|
||||||
|
eestimadrats.ee: eesti, madrats
|
||||||
|
eestiost.ee: eesti, ost
|
||||||
|
eestipinglaed.ee: eesti, pinglaed
|
||||||
|
eestirohelineelu.ee: eesti, roheline, elu
|
||||||
|
eestiterviseuudised.ee: eesti, tervise, uudised
|
||||||
|
eheeesti.ee: ehe, eesti
|
||||||
|
ehitusliiv.ee: ehitus, liiv
|
||||||
|
ehitusgeodeesia.ee: ehitus, geodeesia
|
||||||
|
ehitusakadeemia.ee: ehitus, akadeemia
|
||||||
|
ehitusoutlet1.ee: ehitus, outlet
|
||||||
|
enpeehitus.ee: ehitus
|
||||||
|
eramuteehitus.ee: eramu, ehitus
|
||||||
|
fstehitus.ee: ehitus
|
||||||
|
hkehitusekspertiisid.ee: ehitus, ekspert
|
||||||
|
kronestehitus.ee: est, ehitus
|
||||||
|
makeehituspartner.ee: make, ehitus, partner
|
||||||
|
masirent.ee: rent
|
||||||
|
montessorirent.ee: montessoor, rent
|
||||||
|
paadirent1.ee: paadi, rent
|
||||||
|
pakiautorent.ee: paki, auto, rent
|
||||||
|
pixover.ee: pix, over
|
||||||
|
pixrent.ee: pix, rent
|
||||||
|
rentafriend.ee: rent, friend
|
||||||
|
rentbmw.ee: rent, bmw
|
||||||
|
reservrent.ee: reserv, rent
|
||||||
|
rentellix.ee: rent, ellix?
|
||||||
|
valmismajad.ee: valmis, maja
|
||||||
|
eramajadehooldus.ee: eramaja, hooldus
|
||||||
|
mastimajad.ee: mast, maja
|
||||||
|
nupsikpood.ee: nupsik, pood
|
||||||
|
poodcolordeco.ee: pood, color, deco
|
||||||
|
tarantlipood.ee: tarantli, pood
|
||||||
|
alyanstorupood.ee: toru, pood
|
||||||
|
arriumtech.ee: arrium, tech
|
||||||
|
xeniustech.ee: xenius, tech
|
||||||
|
whitechem.ee: white, chem
|
||||||
|
techme.ee: tech, me
|
||||||
|
techcad.ee: tech, cad
|
||||||
|
estonianharbours.ee: estonia, harbour
|
||||||
|
estonianspl.ee: estonia
|
||||||
|
hauratonestonia.ee: hauraton, estonia
|
||||||
|
koerahoidjatartus.ee: koer, hoidja, tartu
|
||||||
|
terrassidtartus.ee: terrass, tartu
|
1
public/wordcloud/config.json
Normal file
1
public/wordcloud/config.json
Normal file
|
@ -0,0 +1 @@
|
||||||
|
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}
|
11
public/wordcloud/top_words.txt
Normal file
11
public/wordcloud/top_words.txt
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
Top 10 most frequent words:
|
||||||
|
1. auto: 80
|
||||||
|
2. eesti: 65
|
||||||
|
3. 24: 62
|
||||||
|
4. ehitus: 43
|
||||||
|
5. rent: 36
|
||||||
|
6. shop: 34
|
||||||
|
7. estonia: 30
|
||||||
|
8. pood: 27
|
||||||
|
9. tech: 27
|
||||||
|
10. tartu: 24
|
BIN
public/wordcloud/wordcloud.png
Normal file
BIN
public/wordcloud/wordcloud.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.2 MiB |
1
public/wordcloud/wordcloud_config_1747745307.json
Normal file
1
public/wordcloud/wordcloud_config_1747745307.json
Normal file
|
@ -0,0 +1 @@
|
||||||
|
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}
|
1
public/wordcloud/wordcloud_config_1747745435.json
Normal file
1
public/wordcloud/wordcloud_config_1747745435.json
Normal file
|
@ -0,0 +1 @@
|
||||||
|
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}
|
1
public/wordcloud/wordcloud_config_1747831231.json
Normal file
1
public/wordcloud/wordcloud_config_1747831231.json
Normal file
|
@ -0,0 +1 @@
|
||||||
|
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}
|
1
public/wordcloud/wordcloud_config_1747907076.json
Normal file
1
public/wordcloud/wordcloud_config_1747907076.json
Normal file
|
@ -0,0 +1 @@
|
||||||
|
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}
|
Loading…
Add table
Add a link
Reference in a new issue