diff --git a/Dockerfile b/Dockerfile
index bbfac955e..5c27b4f0b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -62,6 +62,17 @@ RUN apt-get install -y --no-install-recommends > /dev/null \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# Install Python packages for wordcloud generation.
+# The script imports `dotenv`; the distribution that provides it is named python-dotenv.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3-pip \
+    python3-setuptools \
+    python3-dev \
+    && pip3 install --upgrade pip setuptools wheel \
+    && pip3 install --no-cache-dir numpy Pillow matplotlib wordcloud openai python-dotenv \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN apt-get autoremove -y && apt-get clean
 
 ENV CHROME_VERSION="128.0.6613.137"
@@ -95,7 +106,6 @@ ENV PATH="/opt/chrome-linux64:${PATH}"
 
 RUN ln -s /lib/ld-linux.so.2 /lib/ld-linux.so.2 || true
 
-# Обертка для wkhtmltopdf с xvfb
 RUN echo '#!/bin/bash\nxvfb-run -a --server-args="-screen 0, 1024x768x24" /usr/bin/wkhtmltopdf "$@"' > /usr/local/bin/wkhtmltopdf \
     && chmod +x /usr/local/bin/wkhtmltopdf
 
diff --git a/Gemfile b/Gemfile
index 6f04e08d8..5bae52e19 100644
--- a/Gemfile
+++ b/Gemfile
@@ -113,3 +113,6 @@ gem 'net-ftp'
 
 # https://stackoverflow.com/questions/79360526/uninitialized-constant-activesupportloggerthreadsafelevellogger-nameerror
 gem 'concurrent-ruby', '1.3.4'
+
+# gives you access to stdin, stdout, and stderr when running other programs
+gem 'open3'
\ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
index aaa2d9a94..9c7de730f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -394,6 +394,7 @@ GEM
     omniauth-rails_csrf_protection (0.1.2)
       actionpack (>= 4.2)
       omniauth (>= 1.3.1)
+    open3 (0.2.1)
     openid_connect (1.4.2)
       activemodel
       attr_required (>= 1.0.0)
@@ -636,6 +637,7 @@ DEPENDENCIES
   nokogiri (~> 1.16.0)
   omniauth-rails_csrf_protection
   omniauth-tara!
+  open3
   openssl
   paper_trail (~> 14.0)
   pdfkit
diff --git a/app/controllers/admin/tools/wordcloud_controller.rb b/app/controllers/admin/tools/wordcloud_controller.rb
new file mode 100644
index 000000000..30d78a6d3
--- /dev/null
+++ b/app/controllers/admin/tools/wordcloud_controller.rb
@@ -0,0 +1,210 @@
+# frozen_string_literal: true
+
+require 'securerandom'
+
+module Admin
+  module Tools
+    # Admin tool that turns an uploaded list of domain names into a wordcloud image.
+    #
+    # The controller validates the upload, stores the rendering options, enqueues
+    # GenerateWordCloudJob and exposes the job's progress, which is kept in
+    # Rails.cache under a key derived from the admin user's id.
+    class WordcloudController < BaseController # rubocop:disable Metrics/ClassLength
+      WORDCLOUD_DIR = Rails.root.join('public', 'wordcloud')
+      WORDCLOUD_IMAGE_PATH = WORDCLOUD_DIR.join('wordcloud.png')
+      WORDCLOUD_CONFIG_PATH = WORDCLOUD_DIR.join('config.json')
+      TOP_WORDS_PATH = WORDCLOUD_DIR.join('top_words.txt')
+
+      # Numeric options as name => [default, lowest, highest]. The limits mirror
+      # the min/max attributes of the number fields in the form partial.
+      NUMERIC_OPTIONS = {
+        width: [800, 400, 2000],
+        height: [800, 400, 2000],
+        max_words: [500, 100, 1000],
+        min_word_length: [2, 1, 5],
+        batch_size: [500, 100, 1000]
+      }.freeze
+
+      # Values offered by the background select in the form partial.
+      BACKGROUND_COLORS = %w[white black transparent #f0f0f0].freeze
+
+      # One "<rank>. <word>: <count>" line of top_words.txt. The word part is not
+      # limited to \w: that class is ASCII-only in Ruby and would drop words
+      # containing õ, ä, ö, ü or a hyphen.
+      TOP_WORD_LINE = /\A\d+\.\s+(?<word>.+?):\s+(?<count>\d+)\s*\z/
+
+      before_action :authorize_admin
+      before_action :clear_cache, only: :create
+      before_action :ensure_wordcloud_dir, only: :create
+
+      # GET /admin/tools/wordcloud
+      # Renders the form with the last used options and, once an image exists,
+      # the image together with its top words.
+      def index
+        @config = load_wordcloud_config
+        setup_wordcloud_data if File.exist?(WORDCLOUD_IMAGE_PATH)
+      end
+
+      # POST /admin/tools/wordcloud
+      # Stores the upload and the options, then hands generation over to the job.
+      def create
+        if params[:domains_file].blank?
+          flash[:alert] = I18n.t('admin.tools.wordcloud.no_file')
+          return redirect_to admin_tools_wordcloud_path
+        end
+
+        domains_file_path = process_uploaded_file(params[:domains_file])
+        return redirect_to admin_tools_wordcloud_path if domains_file_path.nil?
+
+        config = build_config_from_params
+        File.write(WORDCLOUD_CONFIG_PATH, config.to_json)
+
+        GenerateWordCloudJob.perform_later(domains_file_path.to_s, current_admin_user.id, config)
+        redirect_to progress_admin_tools_wordcloud_path
+      rescue StandardError => e
+        # The copied upload has no consumer when the job could not be enqueued.
+        FileUtils.rm_f(domains_file_path) if domains_file_path
+        logger.error "Error starting wordcloud generation: #{e.message}"
+        flash[:alert] = "#{I18n.t('admin.tools.wordcloud.error')}: #{e.message}"
+        redirect_to admin_tools_wordcloud_path
+      end
+
+      # GET /admin/tools/wordcloud/progress
+      def progress
+        @progress_key = progress_cache_key
+        @progress_data = current_progress
+      end
+
+      # GET /admin/tools/wordcloud/status
+      # Returns the same progress hash as JSON.
+      def status
+        render json: current_progress
+      end
+
+      private
+
+      def progress_cache_key
+        "wordcloud_progress:#{current_admin_user.id}"
+      end
+
+      # Progress written by the job, or a placeholder while nothing is cached.
+      def current_progress
+        Rails.cache.read(progress_cache_key) || { status: 'not_started', progress: 0 }
+      end
+
+      def ensure_wordcloud_dir
+        FileUtils.mkdir_p(WORDCLOUD_DIR)
+      end
+
+      # Creates a persistent copy of the uploaded file for the background job.
+      # An empty upload is rejected before anything is written.
+      #
+      # @param uploaded_file [#tempfile] the value of params[:domains_file]
+      # @return [Pathname, nil] path of the copy, or nil (with flash[:alert]
+      #   set) when the upload cannot be used
+      def process_uploaded_file(uploaded_file)
+        unless uploaded_file.respond_to?(:tempfile)
+          flash[:alert] = I18n.t('admin.tools.wordcloud.file_upload_error')
+          return nil
+        end
+
+        if File.zero?(uploaded_file.tempfile.path)
+          flash[:alert] = I18n.t('admin.tools.wordcloud.no_file')
+          return nil
+        end
+
+        # The random suffix keeps uploads made within the same second apart.
+        file_name = "domains_#{Time.now.to_i}_#{SecureRandom.hex(4)}.csv"
+        persistent_file_path = Rails.root.join('tmp', file_name)
+        FileUtils.cp(uploaded_file.tempfile.path, persistent_file_path)
+        persistent_file_path
+      end
+
+      # Builds the options saved to config.json and passed to the job. Numeric
+      # values are coerced to integers within the limits of NUMERIC_OPTIONS, so
+      # a hand-crafted request cannot hand the script unusable values.
+      def build_config_from_params
+        config = NUMERIC_OPTIONS.to_h { |name, limits| [name, integer_param(name, *limits)] }
+        config[:background_color] = background_color_param
+        config[:include_numbers] = params[:include_numbers] == '1'
+        config[:additional_prompt] = params[:additional_prompt].presence
+
+        stopwords = split_terms(params[:additional_stopwords].to_s.downcase)
+        config[:additional_stopwords] = stopwords if stopwords.any?
+
+        special_terms = split_terms(params[:special_terms].to_s)
+        config[:special_terms] = special_terms if special_terms.any?
+
+        config
+      end
+
+      # @return [Integer] params[name] limited to lowest..highest, or default
+      #   when the parameter is missing or not a base-10 integer
+      def integer_param(name, default, lowest, highest)
+        Integer(params[name].to_s, 10, exception: false)&.clamp(lowest, highest) || default
+      end
+
+      def background_color_param
+        color = params[:background_color].to_s
+        BACKGROUND_COLORS.include?(color) ? color : 'white'
+      end
+
+      # Splits a whitespace- and/or comma-separated list into its entries.
+      def split_terms(text)
+        text.split(/[\s,]+/).reject(&:empty?)
+      end
+
+      # Options of the previous run, or the defaults when none are usable.
+      def load_wordcloud_config
+        return default_wordcloud_config unless File.exist?(WORDCLOUD_CONFIG_PATH)
+
+        saved = JSON.parse(File.read(WORDCLOUD_CONFIG_PATH))
+        saved.is_a?(Hash) ? saved : default_wordcloud_config
+      rescue JSON::ParserError
+        default_wordcloud_config
+      end
+
+      def setup_wordcloud_data
+        generated_at = File.mtime(WORDCLOUD_IMAGE_PATH)
+
+        # The modification time doubles as a cache-busting query parameter.
+        @wordcloud_url = "/wordcloud/wordcloud.png?t=#{generated_at.to_i}"
+        @wordcloud_generated_at = generated_at.in_time_zone(Time.zone)
+
+        load_top_words
+      end
+
+      # Reads the [word, count] pairs shown next to the image.
+      def load_top_words
+        return unless File.exist?(TOP_WORDS_PATH)
+
+        @top_words = File.foreach(TOP_WORDS_PATH, encoding: 'UTF-8').filter_map do |line|
+          entry = TOP_WORD_LINE.match(line.scrub)
+          [entry[:word], entry[:count].to_i] if entry
+        end
+      end
+
+      def default_wordcloud_config
+        {
+          'width' => 800,
+          'height' => 800,
+          'max_words' => 500,
+          'background_color' => 'white',
+          'additional_stopwords' => [],
+          'include_numbers' => true,
+          'min_word_length' => 2,
+          'special_terms' => ['e-', 'i-', '2-', '3-', '4-', '.com', 'tr.ee', 'ai', 'web'],
+          'batch_size' => 500
+        }
+      end
+
+      def authorize_admin
+        authorize! :access, :tools
+      end
+
+      def clear_cache
+        Rails.cache.delete(progress_cache_key)
+      end
+    end
+  end
+end
diff --git a/app/controllers/admin/tools_controller.rb b/app/controllers/admin/tools_controller.rb
new file mode 100644
index 000000000..afc2891ac
--- /dev/null
+++ b/app/controllers/admin/tools_controller.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+module Admin
+  # Landing page that lists the administrative tools.
+  class ToolsController < BaseController
+    before_action :authorize_admin
+
+    # GET /admin/tools
+    def index; end
+
+    private
+
+    # Only users that may :access :tools can open the page.
+    def authorize_admin
+      authorize! :access, :tools
+    end
+  end
+end
diff --git a/app/jobs/generate_word_cloud_job.rb b/app/jobs/generate_word_cloud_job.rb
new file mode 100644
index 000000000..618d63404
--- /dev/null
+++ b/app/jobs/generate_word_cloud_job.rb
@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+
+# Use Open3 to capture output in real-time
+require 'open3'
+
+# Background job that generates a wordcloud image from domain names
+# using an external Python script with progress tracking.
+#
+# Progress is published to Rails.cache under "wordcloud_progress:<user_id>" as
+# a hash with :status ('processing', 'completed' or 'failed'), :progress and,
+# on failure, :error.
+class GenerateWordCloudJob < ApplicationJob
+  # Share of the progress scale covered by the word-extraction batches.
+  BATCH_PROGRESS_SHARE = 80
+
+  # The production cache store expires entries after five minutes by default;
+  # keep the outcome readable for longer than that.
+  PROGRESS_TTL = 1.hour
+
+  # @param domains_file_path [String] path of the file with the domain names
+  # @param user_id id of the admin user whose progress entry is updated
+  # @param config [Hash] rendering options, handed to the script as JSON
+  def perform(domains_file_path, user_id, config = {})
+    Rails.logger.info("Generating wordcloud for #{domains_file_path}")
+
+    @domains_file_path = domains_file_path
+    @user_id = user_id
+    @config = config
+    @progress_key = "wordcloud_progress:#{user_id}"
+    @wordcloud_dir = Rails.root.join('public', 'wordcloud')
+    @config_file_path = nil
+    @last_output_line = nil
+
+    initialize_progress
+
+    begin
+      setup_environment
+      run_wordcloud_script
+    rescue StandardError => e
+      handle_error(e)
+    ensure
+      cleanup
+    end
+  end
+
+  private
+
+  def initialize_progress
+    update_progress(0)
+  end
+
+  # Prepares the files the script needs.
+  def setup_environment
+    FileUtils.mkdir_p(@wordcloud_dir)
+
+    # The options reach the script as a JSON file. It lives in tmp/ rather than
+    # the served output directory and is named after the job id to stay unique.
+    @config_file_path = Rails.root.join('tmp', "wordcloud_config_#{job_id}.json")
+    File.write(@config_file_path, @config.to_json)
+
+    # No chmod here: the script is passed to the interpreter as an argument,
+    # so it does not need the executable bit.
+    @script_path = Rails.root.join('lib', 'wordcloud', 'generate_wordcloud.py')
+  end
+
+  def run_wordcloud_script
+    python_executable = ENV.fetch('PYTHON_EXECUTABLE', 'python3')
+    env = { 'PYTHONIOENCODING' => 'utf-8', 'PYTHONUNBUFFERED' => '1' }
+    arguments = [@script_path.to_s, @domains_file_path.to_s, @wordcloud_dir.to_s, @config_file_path.to_s]
+
+    Open3.popen2e(env, python_executable, *arguments) do |stdin, stdout_err, wait_thr|
+      stdin.close
+      process_script_output(stdout_err, wait_thr)
+    end
+  end
+
+  # Consumes the script's combined stdout/stderr line by line, then evaluates
+  # its exit status.
+  def process_script_output(stdout_err, wait_thr)
+    stdout_err.each_line do |raw_line|
+      line = raw_line.scrub.strip
+      update_progress_from_output(line)
+      Rails.logger.info("WordCloud: #{line}")
+      @last_output_line = line unless line.empty?
+    end
+
+    handle_exit_status(wait_thr.value)
+  end
+
+  # Translates the script's status messages into progress values.
+  def update_progress_from_output(line)
+    case line
+    when %r{Processing batch (?<current>\d+)/(?<total>\d+)}
+      batch = Regexp.last_match
+      update_batch_progress(batch[:current].to_i, batch[:total].to_i)
+    when /Total estimated cost/
+      update_progress(BATCH_PROGRESS_SHARE)
+    when /Generating word cloud/
+      update_progress(90)
+    end
+  end
+
+  def update_batch_progress(current, total)
+    return if total.zero?
+
+    share = (current.to_f / total * BATCH_PROGRESS_SHARE).round
+    update_progress(share.clamp(0, BATCH_PROGRESS_SHARE))
+  end
+
+  def update_progress(value, status: 'processing')
+    Rails.cache.write(@progress_key, { status: status, progress: value }, expires_in: PROGRESS_TTL)
+  end
+
+  def handle_exit_status(exit_status)
+    return update_progress(100, status: 'completed') if exit_status.success?
+
+    message = "Process failed with status #{exit_status.exitstatus}"
+    # The last line the script printed is usually the actual error.
+    message += ": #{@last_output_line}" if @last_output_line
+    write_failure(message)
+  end
+
+  def handle_error(exception)
+    Rails.logger.error("Error in WordCloud job: #{exception.message}")
+    Rails.logger.error(Array(exception.backtrace).join("\n"))
+    write_failure(exception.message)
+  end
+
+  def write_failure(message)
+    Rails.cache.write(@progress_key, { status: 'failed', progress: 0, error: message }, expires_in: PROGRESS_TTL)
+  end
+
+  def cleanup
+    FileUtils.rm_f(@config_file_path) if @config_file_path
+    FileUtils.rm_f(@domains_file_path) if uploaded_copy?(@domains_file_path)
+  end
+
+  # True for the "domains_*.csv" copies that Admin::Tools::WordcloudController
+  # places in tmp/; any other input file is left alone.
+  def uploaded_copy?(path)
+    file = Pathname.new(path.to_s).expand_path
+    file.dirname == Rails.root.join('tmp') && file.basename.to_s.match?(/\Adomains_.*\.csv\z/)
+  end
+end
diff --git a/app/models/ability.rb b/app/models/ability.rb
index ba6971dcf..eeec470f2 100644
--- a/app/models/ability.rb
+++ b/app/models/ability.rb
@@ -121,6 +121,7 @@ class Ability
     can :destroy, :pending
     can :create, :zonefile
     can :access, :settings_menu
+    can :access, :tools
     can :manage, :mass_actions
     can :manage, BouncedMailAddress
   end
diff --git a/app/views/admin/base/_menu.haml b/app/views/admin/base/_menu.haml
index f1e855742..73b4daf77 100644
--- a/app/views/admin/base/_menu.haml
+++ b/app/views/admin/base/_menu.haml
@@ -6,6 +6,8 @@
         %li= link_to t(:contacts), admin_contacts_path
       - if can? :show, Registrar
         %li= link_to t(:registrars), admin_registrars_path
+      - if can?(:access, :tools)
+        %li= link_to t(:tools), admin_tools_path
       - if can?(:access, :settings_menu)
         %li.dropdown
           %a.dropdown-toggle{"data-toggle" => "dropdown", href: "#"}
diff --git a/app/views/admin/tools/index.html.erb b/app/views/admin/tools/index.html.erb
new file mode 100644
index 000000000..1b111132a
--- /dev/null
+++ b/app/views/admin/tools/index.html.erb
@@ -0,0 +1,23 @@
+<%= render "shared/title", name: t('admin.tools.title') %>
+
+
+
+
+

<%= t('admin.tools.available_tools') %>

+
+
+
+
+
+

<%= t('admin.tools.wordcloud_generator') %>

+

<%= t('admin.tools.wordcloud_generator_description') %>

+ <%= link_to t('admin.tools.generate_wordcloud'), admin_tools_wordcloud_path, class: 'btn btn-primary' %> +
+
+ +
+
+
+
+
diff --git a/app/views/admin/tools/wordcloud/_form.html.erb b/app/views/admin/tools/wordcloud/_form.html.erb new file mode 100644 index 000000000..ea56f726d --- /dev/null +++ b/app/views/admin/tools/wordcloud/_form.html.erb @@ -0,0 +1,114 @@ +<%= form_tag admin_tools_wordcloud_path, method: :post, multipart: true do %> +
+
+
+
+

<%= t('admin.tools.wordcloud.custom_file_description') %>

+ <%= file_field_tag :domains_file, accept: '.csv', class: 'form-control' %> +
+
+
+
+ +
+
+

+ + <%= t('admin.tools.wordcloud.advanced_options') %> + +

+
+
+
+
+
+
+ <%= label_tag :width, t('admin.tools.wordcloud.width') %> + <%= number_field_tag :width, @config['width'], min: 400, max: 2000, step: 100, class: 'form-control' %> +
+
+
+
+ <%= label_tag :height, t('admin.tools.wordcloud.height') %> + <%= number_field_tag :height, @config['height'], min: 400, max: 2000, step: 100, class: 'form-control' %> +
+
+
+
+ <%= label_tag :max_words, t('admin.tools.wordcloud.max_words') %> + <%= number_field_tag :max_words, @config['max_words'], min: 100, max: 1000, step: 50, class: 'form-control' %> +
+
+
+ +
+
+
+ <%= label_tag :batch_size, t('admin.tools.wordcloud.batch_size') %> + <%= number_field_tag :batch_size, @config['batch_size'], min: 100, max: 1000, step: 50, class: 'form-control' %> + <%= t('admin.tools.wordcloud.batch_size_help') %> +
+
+
+
+ <%= label_tag :background_color, t('admin.tools.wordcloud.background') %> + <%= select_tag :background_color, + options_for_select([ + ['White', 'white'], + ['Black', 'black'], + ['Transparent', 'transparent'], + ['Light Gray', '#f0f0f0'] + ], @config['background_color']), + class: 'form-control' %> +
+
+
+ +
+
+
+ <%= label_tag :min_word_length, t('admin.tools.wordcloud.min_word_length') %> + <%= number_field_tag :min_word_length, @config['min_word_length'], min: 1, max: 5, class: 'form-control' %> +
+
+
+
+
+ +
+
+
+
+ +
+ <%= label_tag :special_terms, t('admin.tools.wordcloud.special_terms') %> + <%= text_field_tag :special_terms, @config['special_terms'].is_a?(Array) ? @config['special_terms'].join(', ') : '', + class: 'form-control', + placeholder: t('admin.tools.wordcloud.special_terms_placeholder') %> + <%= t('admin.tools.wordcloud.special_terms_help') %> +
+ +
+ <%= label_tag :additional_stopwords, t('admin.tools.wordcloud.additional_stopwords') %> + <%= text_area_tag :additional_stopwords, @config['additional_stopwords'].is_a?(Array) ? @config['additional_stopwords'].join(', ') : '', + rows: 3, + placeholder: t('admin.tools.wordcloud.stopwords_placeholder'), + class: 'form-control' %> + <%= t('admin.tools.wordcloud.stopwords_help') %> +
+ +
+ <%= label_tag :additional_prompt, t('admin.tools.wordcloud.additional_prompt') %> + <%= text_area_tag :additional_prompt, @config['additional_prompt'], class: "form-control", rows: 3, + placeholder: t('admin.tools.wordcloud.additional_prompt_placeholder') %> + <%= t('admin.tools.wordcloud.additional_prompt_help') %> +
+
+
+
+ + <%= submit_tag t('admin.tools.generate_wordcloud'), class: 'btn btn-primary btn-lg mt-3' %> +<% end %> \ No newline at end of file diff --git a/app/views/admin/tools/wordcloud/index.html.erb b/app/views/admin/tools/wordcloud/index.html.erb new file mode 100644 index 000000000..083c1b324 --- /dev/null +++ b/app/views/admin/tools/wordcloud/index.html.erb @@ -0,0 +1,92 @@ +<% content_for :actions do %> + <%= link_to t('back'), admin_tools_path, class: 'btn btn-default' %> +<% end %> +<%= render "shared/title", name: t('admin.tools.wordcloud.title') %> + + + +
+
+
+
+
+
+

<%= t('admin.tools.wordcloud.title') %>

+
+
+ <% if @wordcloud_url %> +
+ <%= link_to @wordcloud_url, target: "_blank", title: t('admin.tools.wordcloud.view_full_size') do %> + <%= image_tag @wordcloud_url, class: 'img-responsive', alt: t('admin.tools.wordcloud.title') %> +
+ <%= t('admin.tools.wordcloud.click_to_enlarge') %> +
+ <% end %> + + <% if @wordcloud_generated_at %> +
+ <%= t('admin.tools.wordcloud.generated_at', time: l(@wordcloud_generated_at, format: :long)) %> +
+ <% end %> +
+ <% end %> +
+

<%= t('admin.tools.wordcloud.instructions') %>

+
+ <%= render 'admin/tools/wordcloud/form' %> +
+
+
+ +
+
+
+

<%= t('admin.tools.wordcloud.top_words') %>

+
+
+ <% if @top_words && @top_words.any? %> +
    + <% @top_words.each do |word, count| %> +
  1. <%= word %>: <%= count %>
  2. + <% end %> +
+ <% else %> +

<%= t('admin.tools.wordcloud.top_words_empty') %>

+ <% end %> +
+
+
+
+
+
\ No newline at end of file diff --git a/app/views/admin/tools/wordcloud/progress.html.erb b/app/views/admin/tools/wordcloud/progress.html.erb new file mode 100644 index 000000000..2849b1794 --- /dev/null +++ b/app/views/admin/tools/wordcloud/progress.html.erb @@ -0,0 +1,93 @@ +
+
+
+

WordCloud Generation Progress

+ +
+
+
+
+ <%= @progress_data[:progress] %>% +
+
+ +
+ <% case @progress_data[:status] %> + <% when 'not_started' %> +
Waiting to start processing...
+ <% when 'processing' %> +
Processing in progress...
+ <% when 'completed' %> +
+ WordCloud generation completed! + <%= link_to "View WordCloud", admin_tools_wordcloud_path, class: "btn btn-primary" %> +
+ <% when 'failed' %> +
+ Error: <%= @progress_data[:error] || "Unknown error occurred" %> +
+ <% end %> +
+ +
+ <%= link_to "Back to Tools", admin_tools_path, class: "btn btn-secondary" %> +
+
+
+
+
+
+ + \ No newline at end of file diff --git a/config/environments/production.rb b/config/environments/production.rb index 2825185d0..e3f6591d6 100644 --- a/config/environments/production.rb +++ b/config/environments/production.rb @@ -54,6 +54,10 @@ Rails.application.configure do # Use a different cache store in production. # config.cache_store = :mem_cache_store + config.cache_store = :redis_cache_store, { + url: "#{ENV.fetch('REDIS_URL', 'redis://localhost:6379')}/1", + expires_in: 300.seconds + } # Use a real queuing backend for Active Job (and separate queues per environment) config.active_job.queue_adapter = :sidekiq diff --git a/config/initializers/sidekiq.rb b/config/initializers/sidekiq.rb index b2d067a2f..33392bcb8 100644 --- a/config/initializers/sidekiq.rb +++ b/config/initializers/sidekiq.rb @@ -2,7 +2,7 @@ require 'sidekiq/web' # Require at the top of the initializer Sidekiq.configure_server do |config| config.logger.level = Logger::INFO - + # Custom job logging format Sidekiq.logger.formatter = proc do |severity, datetime, progname, msg| thread_id = Thread.current.object_id.to_s(36) diff --git a/config/locales/admin/tools.en.yml b/config/locales/admin/tools.en.yml new file mode 100644 index 000000000..073be1b6c --- /dev/null +++ b/config/locales/admin/tools.en.yml @@ -0,0 +1,42 @@ +en: + admin: + tools: + title: "Administrative Tools" + available_tools: "Available Tools" + wordcloud_generator: "Domain Name Wordcloud Generator" + wordcloud_generator_description: "Generate a visual wordcloud from domain names in the registry" + generate_wordcloud: "Generate Wordcloud" + wordcloud: + title: "Domain Name Wordcloud" + success: "Wordcloud generated successfully" + error: "Error generating wordcloud" + processing: "Processing domain names. This may take a few minutes..." + instructions: "Generate a visual representation of the most common words in domain names. Click the button below to create the wordcloud." 
+ top_words: "Top Words" + top_words_empty: "Generate a wordcloud to see the most frequent words." + click_to_enlarge: "Click to enlarge" + view_full_size: "View full size wordcloud image" + use_custom_domains: "Use custom domain list" + custom_file_description: "Upload a CSV file with one domain name per line" + file_upload_error: "Error processing uploaded file" + file_optional: "If no file is uploaded, all active domains in the registry will be used" + generated_at: "Generated at %{time}" + no_file: "No domain names found" + width: "Width" + height: "Height" + max_words: "Max Words" + background: "Background" + additional_stopwords: "Stopwords" + stopwords_placeholder: "Enter additional stopwords, one per line" + stopwords_help: "Stopwords are words that will not be included in the wordcloud" + advanced_options: "Advanced Options" + min_word_length: "Min Word Length" + include_numbers: "Include Numbers" + special_terms: "Special Terms" + special_terms_placeholder: "e.g., e-, i-, .com, ai, web" + special_terms_help: "These terms will be preserved in the word cloud even if they would normally be filtered out" + batch_size: "Batch Size" + batch_size_help: "Number of domains to process in each API call." + additional_prompt: "Additional Prompt Text" + additional_prompt_placeholder: "Add any additional instructions for the word cloud generation here..." + additional_prompt_help: "Optional text that will be used as additional context during word cloud generation." 
diff --git a/config/locales/en.yml b/config/locales/en.yml
index fe98336f3..82e131a22 100644
--- a/config/locales/en.yml
+++ b/config/locales/en.yml
@@ -229,6 +229,7 @@ en:
   valid_from: 'Valid from'
   general: 'General'
   contacts: 'Contacts'
+  tools: 'Tools'
   identity_code: 'Identity code'
   nameservers: 'Nameservers'
   hostname: 'Hostname'
diff --git a/config/routes.rb b/config/routes.rb
index 50e72511b..aca21c4f2 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -250,6 +250,17 @@ Rails.application.routes.draw do
     end
     # post 'admi/upload_spreadsheet', to: 'customers#upload_spreadsheet', as: :customers_upload_spreadsheet
 
+    resources :tools, only: %i[index]
+
+    namespace :tools do
+      resource :wordcloud, controller: 'wordcloud', only: %i[create] do
+        collection do
+          get '', to: 'wordcloud#index'
+          get 'progress', to: 'wordcloud#progress', as: :progress
+          get 'status', to: 'wordcloud#status', as: :status
+        end
+      end
+    end
 
     resources :bank_statements do
       resources :bank_transactions
diff --git a/lib/wordcloud/fonts/Pacifico-Regular.ttf b/lib/wordcloud/fonts/Pacifico-Regular.ttf
new file mode 100644
index 000000000..e7def95d3
Binary files /dev/null and b/lib/wordcloud/fonts/Pacifico-Regular.ttf differ
diff --git a/lib/wordcloud/generate_wordcloud.py b/lib/wordcloud/generate_wordcloud.py
new file mode 100644
index 000000000..e9eaf1ea2
--- /dev/null
+++ b/lib/wordcloud/generate_wordcloud.py
@@ -0,0 +1,365 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import sys
+import json
+import random
+import asyncio
+import numpy as np
+from PIL import Image
+from os import path
+from wordcloud import WordCloud, STOPWORDS
+from openai import AsyncOpenAI
+import matplotlib.pyplot as plt
+# import pandas as pd
+from dotenv import load_dotenv
+load_dotenv()
+
+BATCH_SIZE = int(os.environ.get("OPENAI_BATCH_SIZE", "20"))
+
+def load_system_prompt():
+    """Loads system prompt from system_prompt.md file"""
+    prompt_file = path.join(path.dirname(__file__), 'system_prompt.md')
+
+    if not path.exists(prompt_file):
+        raise FileNotFoundError(f"System prompt not found at {prompt_file}. Please create the file.")
+
+    with open(prompt_file, 'r', encoding='utf-8') as f:
+        system_prompt = f.read()
+
+    return system_prompt
+
+d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
+
+output_dir = sys.argv[2] if len(sys.argv) > 2 else d
+
+try:
+    SYSTEM_PROMPT = load_system_prompt()
+    print("System prompt successfully loaded from file.")
+except FileNotFoundError as e:
+    print(f"Error: {e}")
+    sys.exit(1)
+
+# Load configuration if provided
+config = {}
+if len(sys.argv) > 3 and sys.argv[3]:
+    config_file = sys.argv[3]
+    if path.exists(config_file):
+        # Read as UTF-8 explicitly: stopwords and prompt text may contain non-ASCII characters
+        with open(config_file, 'r', encoding='utf-8') as f:
+            config = json.load(f)
+            print(f"Loaded configuration: {config}")
+
+# Check if domains file path is provided and exists
+if len(sys.argv) > 1 and sys.argv[1]:
+    domains_file = sys.argv[1]
+    if not path.exists(domains_file):
+        print(f"Error: Provided domains file {domains_file} not found")
+        sys.exit(1)
+else:
+    print("Error: Domains file not found")
+    sys.exit(1)
+
+# Read domain names from the file
+with open(domains_file, 'r', encoding='utf-8') as f:
+    domain_names = [line.strip().lower() for line in f if line.strip()]
+
+if not domain_names:
+    print("Error: No domain names found in the provided file")
+    sys.exit(1)
+
+
+# Function to extract words using OpenAI API asynchronously
+async def extract_words_with_openai(domain_names, batch_size=BATCH_SIZE):
+    """Split the domain names into words with the OpenAI chat completions API.
+
+    Domains whose first label consists only of digits are skipped. The rest
+    are sent in batches of ``batch_size``, at most 10 requests at a time. A
+    batch whose request or response handling fails contributes no words.
+
+    Returns a flat list with the words of every result the model did not
+    mark with ``Language == 'Ignore'``.
+    """
+    if batch_size <= 0:
+        raise ValueError("batch_size must be a positive integer")
+
+    filtered_domains = []
+
+    # Filter out domains that are only numbers
+    for domain in domain_names:
+        domain_core = domain.lower()
+        # Strip "www." only as a prefix; replace() would also cut it out of the middle of a name
+        if domain_core.startswith('www.'):
+            domain_core = domain_core[len('www.'):]
+        main_part = domain_core.split('.')[0]
+        if not main_part.isdigit():
+            filtered_domains.append(domain)
+
+
+    # Get API key from environment variable
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")
+
+    # Initialize AsyncOpenAI client
+    client = AsyncOpenAI(api_key=api_key)
+
+    # Get model and temperature from environment variables
+    model = os.environ.get("OPENAI_MODEL", "gpt-4o-2024-11-20")
+    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0"))
+    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "16000"))
+
+    # Process domains in batches
+    all_words = []
+    total_prompt_tokens = 0
+    total_completion_tokens = 0
+    total_cost = 0
+
+    # Calculate number of batches
+    num_batches = (len(filtered_domains) + batch_size - 1) // batch_size
+
+    # Create semaphore to limit concurrent requests
+    semaphore = asyncio.Semaphore(10)  # Limit to 10 concurrent requests
+
+    async def process_batch(batch_idx):
+        async with semaphore:
+            start_idx = batch_idx * batch_size
+            end_idx = min(start_idx + batch_size, len(filtered_domains))
+            batch = filtered_domains[start_idx:end_idx]
+
+            print(f"Processing batch {batch_idx + 1}/{num_batches} ({len(batch)} domains)...")
+            sys.stdout.flush()
+
+            # Prepare the prompt with the domain names of this batch
+            domains_text = "\n".join(batch)
+            prompt = f"List of domain names: {domains_text}"
+
+            # Make the API call
+            try:
+                print(f"Using model: {model} with temperature: {temperature}")
+                response = await client.chat.completions.create(
+                    model=model,
+                    messages=[
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {"role": "user", "content": prompt}
+                    ],
+                    response_format={
+                        "type": "json_schema",
+                        "json_schema": {
+                            "name": "domain_analysis_results",
+                            "strict": True,
+                            "schema": {
+                                "type": "object",
+                                "properties": {
+                                    "results": {
+                                        "type": "array",
+                                        "description": "A list of analysis results for the provided domains.",
+                                        "items": {
+                                            "type": "object",
+                                            "properties": {
+                                                "Language": {
+                                                    "type": "string",
+                                                    "description": "The language identified in the domain name."
+                                                },
+                                                "is_splitted": {
+                                                    "type": "string",
+                                                    "description": "Indicates whether the domain name is split into recognizable words."
+                                                },
+                                                "reasoning": {
+                                                    "type": "string",
+                                                    "description": "Explanation of the reasoning behind the language and word identification."
+                                                },
+                                                "words": {
+                                                    "type": "array",
+                                                    "description": "The words identified in the domain name.",
+                                                    "items": {
+                                                        "type": "string"
+                                                    }
+                                                }
+                                            },
+                                            "required": [
+                                                "Language",
+                                                "is_splitted",
+                                                "reasoning",
+                                                "words"
+                                            ],
+                                            "additionalProperties": False
+                                        }
+                                    }
+                                },
+                                "required": [
+                                    "results"
+                                ],
+                                "additionalProperties": False
+                            }
+                        }
+                    },
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                )
+
+                # Track token usage
+                prompt_tokens = response.usage.prompt_tokens
+                completion_tokens = response.usage.completion_tokens
+                total_tokens = response.usage.total_tokens
+
+                nonlocal total_prompt_tokens, total_completion_tokens
+                total_prompt_tokens += prompt_tokens
+                total_completion_tokens += completion_tokens
+
+                print(f"Token usage - Prompt: {prompt_tokens}, Completion: {completion_tokens}, Total: {total_tokens}")
+
+                # Calculate cost (approximate, based on current pricing)
+                if "gpt-4.1" in model:
+                    prompt_cost = (prompt_tokens / 1000000) * 2.00  # $2.00 per 1M tokens for GPT-4.1 input
+                    completion_cost = (completion_tokens / 1000000) * 8.00  # $8.00 per 1M tokens for GPT-4.1 output
+                else:
+                    prompt_cost = 0
+                    completion_cost = 0
+
+                batch_cost = prompt_cost + completion_cost
+                nonlocal total_cost
+                total_cost += batch_cost
+                print(f"Estimated batch cost: ${batch_cost:.6f}")
+
+                # Extract the words from the response
+                response_json = json.loads(response.choices[0].message.content)
+                batch_words = []
+                for result in response_json['results']:
+                    if result['Language'] == 'Ignore':
+                        continue
+                    batch_words.extend(result['words'])
+
+                print(f"Extracted {len(batch_words)} words from this batch")
+                return batch_words
+
+            except Exception as e:
+                print(f"Error calling OpenAI API for batch: {e}")
+                return []
+
+    # Create tasks for each batch
+    tasks = []
+    for batch_idx in range(num_batches):
+        tasks.append(process_batch(batch_idx))
+
+    # Run all tasks concurrently and wait for results
+    batch_results = await asyncio.gather(*tasks)
+
+    # Combine all words from all batches
+    for batch_words in batch_results:
+        all_words.extend(batch_words)
+
+    print(f"Total token usage - Prompt: {total_prompt_tokens}, Completion: {total_completion_tokens}")
+    print(f"Total estimated cost: ${total_cost:.6f}")
+
+    return all_words
+
+# Replace the synchronous call with an async function
+async def main():
+    # Process domain names using OpenAI
+    print("Extracting words from domain names using OpenAI...")
+    extracted_words = await extract_words_with_openai(domain_names)
+    print(f"Extracted {len(extracted_words)} words")
+
+    # Join the extracted words for the word cloud
+    processed_text = ' '.join(extracted_words)
+
+    def custom_color_func(word, font_size, position, orientation, random_state=None,
+                          **kwargs):
+        return "hsl(215, 100%%, %d%%)" % random.randint(15, 80)
+
+    mask = np.array(Image.open(path.join(d, 'mask.png')))
+
+    # Get configuration values with defaults
+    width = int(config.get('width', 800))
+    height = int(config.get('height', 800))
+    max_words = int(config.get('max_words', 500))
+    background_color = config.get('background_color', 'white')
+    min_word_length = int(config.get('min_word_length', 2))
+    include_numbers = config.get('include_numbers', True)
+
+    # Handle transparent background
+    if background_color == 'transparent':
+        background_color = None
+
+    # Get additional stopwords
+    additional_stopwords = config.get('additional_stopwords', [])
+
+    stopwords = set(STOPWORDS)
+    stopwords = {
+        'ja', 'ning', 'et', 'kui', 'aga', 'ka', 'ei', 'see', 'on', 'ole',
+        'oma', 'seda', 'siis', 'või', 'mis', 'nii', 'veel', 'kes', 'üle',
+        'välja', 'olema', 'kus', 'nagu', 'kuid', 'selle', 'pole', 'ära',
+        'vaid', 'sest', 'juba', 'meie', 'mida', 'need', 'olid', 'minu',
+        'tema', 'pärast',
'mingi', 'palju', 'kõik', 'seal', 'olen', 'oled', + 'oli', 'olnud', 'ongi', 'poolt', 'meil', 'teda', 'just', 'kuna', + 'läbi', 'küll', + 'the', 'and', 'a', 'to', 'of', 'in', 'is', 'that', 'it', 'for', + 'with', 'as', 'be', 'on', 'not', 'this', 'but', 'by', 'from', 'are', + 'or', 'an', 'at', 'was', 'have', 'has', 'had', 'were', 'will', 'would', + 'should', 'can', 'could', 'may', 'might', 'must', 'do', 'does', 'did', + 'doing', 'done', 'their', 'they', 'them', 'there', 'these', 'those', + 'which', 'who', 'whom', 'whose', 'what', 'when', 'where', 'why', 'how' + } + + stopwords.update(stopwords) + stopwords.update(additional_stopwords) + + font_path = path.join(d, 'fonts', 'Pacifico-Regular.ttf') + # Alternative: use a system font + # font_path = fm.findfont(fm.FontProperties(family='Arial')) + + print("Generating word cloud...") + wc = WordCloud(width=width, height=height, + mask=mask, + stopwords=stopwords, + background_color=background_color, + max_words=max_words, + include_numbers=include_numbers, + collocations=False, + min_word_length=min_word_length, + regexp=r"[A-Za-zÕÄÖÜõäöü0-9][\w\-'ÕÄÖÜõäöü]*(?