Mirror of https://github.com/internetee/registry.git (synced 2025-08-17 15:03:59 +02:00)

commit aa4d36a0ad (parent 5b6888eb43)

implement async request to ai model and structured output

10 changed files with 304 additions and 194 deletions
Dockerfile (11 lines changed)

@@ -62,6 +62,16 @@ RUN apt-get install -y --no-install-recommends > /dev/null \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

+# Install Python packages for wordcloud generation
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3-pip \
+    python3-setuptools \
+    python3-dev \
+    && pip3 install --upgrade pip setuptools wheel \
+    && pip3 install --no-cache-dir numpy Pillow matplotlib wordcloud openai dotenv \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
 RUN apt-get autoremove -y && apt-get clean

 ENV CHROME_VERSION="128.0.6613.137"

@@ -95,7 +105,6 @@ ENV PATH="/opt/chrome-linux64:${PATH}"

 RUN ln -s /lib/ld-linux.so.2 /lib/ld-linux.so.2 || true

-# Wrapper for wkhtmltopdf with xvfb
 RUN echo '#!/bin/bash\nxvfb-run -a --server-args="-screen 0, 1024x768x24" /usr/bin/wkhtmltopdf "$@"' > /usr/local/bin/wkhtmltopdf \
     && chmod +x /usr/local/bin/wkhtmltopdf
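A quick way to sanity-check the new layer is to confirm that everything the pip3 line installs is importable inside the built image. A minimal sketch (the helper script is hypothetical, not part of the commit; note that Pillow is imported as PIL):

```python
# deps_check.py, a hypothetical smoke test for the new Dockerfile layer.
import importlib

for module in ("numpy", "PIL", "matplotlib", "wordcloud", "openai", "dotenv"):
    importlib.import_module(module)  # raises ImportError if the layer is broken
    print(f"{module}: OK")
```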
app/jobs/generate_word_cloud_job.rb

@@ -5,6 +5,9 @@ require 'open3'
 # using an external Python script with progress tracking
 class GenerateWordCloudJob < ApplicationJob
   def perform(domains_file_path, user_id, config = {})
+
+    Rails.logger.info("Generating wordcloud for #{domains_file_path}")
+
     @domains_file_path = domains_file_path
     @user_id = user_id
     @config = config
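The job hands the heavy lifting to the Python script via Open3. The script's argv contract is visible in the diff below: sys.argv[2] is the output directory and sys.argv[3] an optional JSON config, while argv[1] being the domains file is inferred from the job's domains_file_path. A sketch of invoking it directly, with an assumed script filename and an assumed config encoding:

```python
# Hypothetical direct invocation of the word-cloud script, mirroring what
# GenerateWordCloudJob does via Open3. The script path is an assumption.
import json
import subprocess

config = {"width": "800", "height": "800", "background_color": "white"}
subprocess.run(
    [
        "python3", "lib/wordcloud/generate_wordcloud.py",  # assumed filename
        "tmp/domains.txt",    # argv[1]: file with the domain names (inferred)
        "public/wordcloud",   # argv[2]: output directory (sys.argv[2])
        json.dumps(config),   # argv[3]: optional config (sys.argv[3]);
                              # passing it as a JSON string is an assumption
    ],
    check=True,
)
```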
The word-cloud generator script (in lib/wordcloud/, alongside the new system_prompt.md):

@@ -5,19 +5,42 @@ import re
 import sys
 import json
 import random
+import asyncio
 import numpy as np
 from PIL import Image
 from os import path
 from wordcloud import WordCloud, STOPWORDS
-import openai
+from openai import AsyncOpenAI
 import matplotlib.pyplot as plt
+# import pandas as pd
 from dotenv import load_dotenv
 load_dotenv()

+BATCH_SIZE = int(os.environ.get("OPENAI_BATCH_SIZE", "20"))
+
+def load_system_prompt():
+    """Loads system prompt from system_prompt.md file"""
+    prompt_file = path.join(path.dirname(__file__), 'system_prompt.md')
+
+    if not path.exists(prompt_file):
+        raise FileNotFoundError(f"System prompt not found at {prompt_file}. Please create the file.")
+
+    with open(prompt_file, 'r', encoding='utf-8') as f:
+        system_prompt = f.read()
+
+    return system_prompt
+
 d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

 output_dir = sys.argv[2] if len(sys.argv) > 2 else d

+try:
+    SYSTEM_PROMPT = load_system_prompt()
+    print("System prompt successfully loaded from file.")
+except FileNotFoundError as e:
+    print(f"Error: {e}")
+    sys.exit(1)
+
 # Load configuration if provided
 config = {}
 if len(sys.argv) > 3 and sys.argv[3]:
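The OpenAI settings are now read from environment variables (picked up by load_dotenv), so a .env file can supply them. An illustrative example using the defaults visible in this diff (the API key value is a placeholder):

```
OPENAI_API_KEY=sk-...
OPENAI_MODEL=gpt-4.1-2025-04-14
OPENAI_TEMPERATURE=0
OPENAI_MAX_TOKENS=16000
OPENAI_BATCH_SIZE=20
```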
Continuing in the same script:

@@ -45,29 +68,31 @@ if not domain_names:
     print("Error: No domain names found in the provided file")
     sys.exit(1)

-# Get special terms from config or use defaults
-SPECIAL_TERMS = config.get('special_terms', ['e-', 'i-', '2-', '3-', '4-', '.com', 'tr.ee', 'ai', 'web'])
-print(f"Using special terms: {SPECIAL_TERMS}")
-
-# Get batch size from config or use default
-BATCH_SIZE = int(config.get('batch_size', 500))
-print(f"Using batch size: {BATCH_SIZE}")
-
-# Get additional prompt from config or use default
-ADDITIONAL_PROMPT = config.get('additional_prompt', None)
-print(f"Using additional prompt: {ADDITIONAL_PROMPT}")
-
-# Function to extract words using OpenAI API
-def extract_words_with_openai(domain_names, special_terms, batch_size=BATCH_SIZE, additional_prompt=ADDITIONAL_PROMPT):
+# Function to extract words using OpenAI API asynchronously
+async def extract_words_with_openai(domain_names, batch_size=BATCH_SIZE):
+    filtered_domains = []
+
+    # Filter out domains that are only numbers
+    for domain in domain_names:
+        domain_core = domain.lower().replace('www.', '')
+        main_part = domain_core.split('.')[0]
+        if not main_part.isdigit():
+            filtered_domains.append(domain)
+
     # Get API key from environment variable
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
         raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")

+    # Initialize AsyncOpenAI client
+    client = AsyncOpenAI(api_key=api_key)
+
     # Get model and temperature from environment variables
     model = os.environ.get("OPENAI_MODEL", "gpt-4.1-2025-04-14")
-    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0.3"))
-    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "2000"))
+    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0"))
+    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "16000"))

     # Process domains in batches
     all_words = []
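A quick check of the new digit-only filter, with example domains (the loop body is copied from the diff):

```python
# Domains whose main label is all digits are dropped before batching.
domain_names = ["auto24.ee", "www.12345.ee", "e-pood.ee"]
filtered_domains = []
for domain in domain_names:
    domain_core = domain.lower().replace('www.', '')
    main_part = domain_core.split('.')[0]
    if not main_part.isdigit():
        filtered_domains.append(domain)
print(filtered_domains)  # ['auto24.ee', 'e-pood.ee']
```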
The core of the change, in the same script: the sequential per-batch loop becomes an async process_batch coroutine, the long inline prompt gives way to the system prompt file plus a strict JSON schema, and batches run concurrently under a semaphore.

@@ -76,199 +101,251 @@ def extract_words_with_openai(domain_names, special_terms, batch_size=BATCH_SIZE, additional_prompt=ADDITIONAL_PROMPT):
     total_cost = 0

     # Calculate number of batches
-    num_batches = (len(domain_names) + batch_size - 1) // batch_size
+    num_batches = (len(filtered_domains) + batch_size - 1) // batch_size

-    for i in range(0, len(domain_names), batch_size):
-        batch = domain_names[i:i+batch_size]
-        print(f"Processing batch {i//batch_size + 1}/{num_batches} ({len(batch)} domains)...")
-        sys.stdout.flush()
-
-        # Prepare the prompt with domain names and special terms
-        domains_text = "\n".join(batch)
-        special_terms_text = ", ".join([f"`{term}`" for term in special_terms])
-
-        prompt = f"""You are a bilingual Estonian-English linguist and word segmentation expert. I will give you a list of .ee domain names.
-
-Your task is to extract a clean list of words for word cloud generation.
-
-Follow these rules strictly:
-1. Before doing anything else, always extract and separate these predefined special terms if they appear as prefixes or parts of the domain name: {special_terms_text}. Keep symbols and numbers as they are. For example, if the domain name is `e-robot.ee`, the output should be `e- robot`. Remove extensions from the special terms.
-2. If a word contains a number (e.g., `auto24`), separate the number and the word: `auto`, `24`.
-3. If the domain name is a compound of 2+ Estonian or English words (e.g., `virtuaalabiline` or `doorkeeper`), intelligently split them into individual meaningful components. Prioritize Estonian words over English words.
-4. Keep all resulting words in lowercase and remove the `.ee` extension from all the words
-5. Try to find the most common words and phrases in the domain names.
-6. Return ONLY a space-separated list of words and numbers with no explanations, no formatting, no introductions, and no additional text.
-
-{additional_prompt}
-
-Example output format:
-word1 word2 word3 word4 word5
-
-Here are the domain names:
-{domains_text}
-"""
-
-        # Make the API call
-        try:
-            print(f"Using model: {model} with temperature: {temperature}")
-            response = openai.chat.completions.create(
-                model=model,
-                messages=[
-                    {"role": "system", "content": "You are a helpful assistant that extracts words from domain names. You ONLY output the extracted words with no additional text."},
-                    {"role": "user", "content": prompt}
-                ],
-                temperature=temperature,
-                max_tokens=max_tokens
-            )
-
-            # Track token usage
-            prompt_tokens = response.usage.prompt_tokens
-            completion_tokens = response.usage.completion_tokens
-            total_tokens = response.usage.total_tokens
-
-            total_prompt_tokens += prompt_tokens
-            total_completion_tokens += completion_tokens
-
-            print(f"Token usage - Prompt: {prompt_tokens}, Completion: {completion_tokens}, Total: {total_tokens}")
-
-            # Calculate cost (approximate, based on current pricing)
-            if "gpt-4.1" in model:
-                prompt_cost = (prompt_tokens / 1000000) * 2.00  # $2.00 per 1M tokens for GPT-4.1 input
-                completion_cost = (completion_tokens / 1000000) * 8.00  # $8.00 per 1M tokens for GPT-4.1 output
-            else:
-                prompt_cost = 0
-                completion_cost = 0
-
-            batch_cost = prompt_cost + completion_cost
-            total_cost += batch_cost
-            print(f"Estimated batch cost: ${batch_cost:.6f}")
-
-            # Extract the words from the response
-            words_text = response.choices[0].message.content.strip()
-
-            # Process the response to get a clean list of words
-            batch_words = []
-            for line in words_text.split('\n'):
-                line = line.strip()
-                if line and not line.startswith('```') and not line.endswith('```'):
-                    # Remove any list markers like "1. ", "- ", etc.
-                    cleaned_line = re.sub(r'^[\d\-\*\•\.\s]+', '', line)
-                    if cleaned_line:
-                        batch_words.extend(cleaned_line.split())
-
-            all_words.extend(batch_words)
-            print(f"Extracted {len(batch_words)} words from this batch")
-
-        except Exception as e:
-            print(f"Error calling OpenAI API for batch: {e}")
+    # Create semaphore to limit concurrent requests
+    semaphore = asyncio.Semaphore(8)  # Limit to 8 concurrent requests
+
+    async def process_batch(batch_idx):
+        async with semaphore:
+            start_idx = batch_idx * batch_size
+            end_idx = min(start_idx + batch_size, len(filtered_domains))
+            batch = filtered_domains[start_idx:end_idx]
+
+            print(f"Processing batch {batch_idx + 1}/{num_batches} ({len(batch)} domains)...")
+            sys.stdout.flush()
+
+            # Prepare the prompt with the batch of domain names
+            domains_text = "\n".join(batch)
+            prompt = f"List of domain names: {domains_text}"
+
+            # Make the API call
+            try:
+                print(f"Using model: {model} with temperature: {temperature}")
+                response = await client.chat.completions.create(
+                    model=model,
+                    messages=[
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {"role": "user", "content": prompt}
+                    ],
+                    response_format={
+                        "type": "json_schema",
+                        "json_schema": {
+                            "name": "domain_analysis_results",
+                            "strict": True,
+                            "schema": {
+                                "type": "object",
+                                "properties": {
+                                    "results": {
+                                        "type": "array",
+                                        "description": "A list of analysis results for the provided domains.",
+                                        "items": {
+                                            "type": "object",
+                                            "properties": {
+                                                "Language": {
+                                                    "type": "string",
+                                                    "description": "The language identified in the domain name."
+                                                },
+                                                "is_splitted": {
+                                                    "type": "string",
+                                                    "description": "Indicates whether the domain name is split into recognizable words."
+                                                },
+                                                "reasoning": {
+                                                    "type": "string",
+                                                    "description": "Explanation of the reasoning behind the language and word identification."
+                                                },
+                                                "words": {
+                                                    "type": "array",
+                                                    "description": "The words identified in the domain name.",
+                                                    "items": {
+                                                        "type": "string"
+                                                    }
+                                                }
+                                            },
+                                            "required": [
+                                                "Language",
+                                                "is_splitted",
+                                                "reasoning",
+                                                "words"
+                                            ],
+                                            "additionalProperties": False
+                                        }
+                                    }
+                                },
+                                "required": [
+                                    "results"
+                                ],
+                                "additionalProperties": False
+                            }
+                        }
+                    },
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                )
+
+                # Track token usage
+                prompt_tokens = response.usage.prompt_tokens
+                completion_tokens = response.usage.completion_tokens
+                total_tokens = response.usage.total_tokens
+
+                nonlocal total_prompt_tokens, total_completion_tokens
+                total_prompt_tokens += prompt_tokens
+                total_completion_tokens += completion_tokens
+
+                print(f"Token usage - Prompt: {prompt_tokens}, Completion: {completion_tokens}, Total: {total_tokens}")
+
+                # Calculate cost (approximate, based on current pricing)
+                if "gpt-4.1" in model:
+                    prompt_cost = (prompt_tokens / 1000000) * 2.00  # $2.00 per 1M tokens for GPT-4.1 input
+                    completion_cost = (completion_tokens / 1000000) * 8.00  # $8.00 per 1M tokens for GPT-4.1 output
+                else:
+                    prompt_cost = 0
+                    completion_cost = 0
+
+                batch_cost = prompt_cost + completion_cost
+                nonlocal total_cost
+                total_cost += batch_cost
+                print(f"Estimated batch cost: ${batch_cost:.6f}")
+
+                # Extract the words from the structured response
+                response_json = json.loads(response.choices[0].message.content)
+                batch_words = []
+                for result in response_json['results']:
+                    if result['Language'] == 'Ignore':
+                        continue
+                    batch_words.extend(result['words'])
+
+                print(f"Extracted {len(batch_words)} words from this batch")
+                return batch_words
+
+            except Exception as e:
+                print(f"Error calling OpenAI API for batch: {e}")
+                return []
+
+    # Create tasks for each batch
+    tasks = []
+    for batch_idx in range(num_batches):
+        tasks.append(process_batch(batch_idx))
+
+    # Run all tasks concurrently and wait for results
+    batch_results = await asyncio.gather(*tasks)
+
+    # Combine all words from all batches
+    for batch_words in batch_results:
+        all_words.extend(batch_words)

     print(f"Total token usage - Prompt: {total_prompt_tokens}, Completion: {total_completion_tokens}")
     print(f"Total estimated cost: ${total_cost:.6f}")

     return all_words

-# Process domain names using OpenAI
-print("Extracting words from domain names using OpenAI...")
-extracted_words = extract_words_with_openai(domain_names, SPECIAL_TERMS)
-print(f"Extracted {len(extracted_words)} words")
-# print("Sample of extracted words:", extracted_words)
+# Replace the synchronous call with an async function
+async def main():
+    # Process domain names using OpenAI
+    print("Extracting words from domain names using OpenAI...")
+    extracted_words = await extract_words_with_openai(domain_names)
+    print(f"Extracted {len(extracted_words)} words")

 # Join the extracted words for the word cloud
 processed_text = ' '.join(extracted_words)
-# print("Processed text sample:", processed_text)

 def custom_color_func(word, font_size, position, orientation, random_state=None,
                       **kwargs):
     return "hsl(215, 100%%, %d%%)" % random.randint(15, 80)

 mask = np.array(Image.open(path.join(d, 'mask.png')))

 # Get configuration values with defaults
 width = int(config.get('width', 800))
 height = int(config.get('height', 800))
 max_words = int(config.get('max_words', 500))
 background_color = config.get('background_color', 'white')
 min_word_length = int(config.get('min_word_length', 2))
 include_numbers = config.get('include_numbers', True)

 # Handle transparent background
 if background_color == 'transparent':
     background_color = None

 # Get additional stopwords
 additional_stopwords = config.get('additional_stopwords', [])

 stopwords = set(STOPWORDS)
 stopwords = {
     'ja', 'ning', 'et', 'kui', 'aga', 'ka', 'ei', 'see', 'on', 'ole',
     'oma', 'seda', 'siis', 'või', 'mis', 'nii', 'veel', 'kes', 'üle',
     'välja', 'olema', 'kus', 'nagu', 'kuid', 'selle', 'pole', 'ära',
     'vaid', 'sest', 'juba', 'meie', 'mida', 'need', 'olid', 'minu',
     'tema', 'pärast', 'mingi', 'palju', 'kõik', 'seal', 'olen', 'oled',
     'oli', 'olnud', 'ongi', 'poolt', 'meil', 'teda', 'just', 'kuna',
     'läbi', 'küll',
     'the', 'and', 'a', 'to', 'of', 'in', 'is', 'that', 'it', 'for',
     'with', 'as', 'be', 'on', 'not', 'this', 'but', 'by', 'from', 'are',
     'or', 'an', 'at', 'was', 'have', 'has', 'had', 'were', 'will', 'would',
     'should', 'can', 'could', 'may', 'might', 'must', 'do', 'does', 'did',
     'doing', 'done', 'their', 'they', 'them', 'there', 'these', 'those',
     'which', 'who', 'whom', 'whose', 'what', 'when', 'where', 'why', 'how'
 }

 stopwords.update(stopwords)
 stopwords.update(additional_stopwords)

 font_path = path.join(d, 'fonts', 'Pacifico-Regular.ttf')
 # Alternative: use a system font
 # font_path = fm.findfont(fm.FontProperties(family='Arial'))

 print("Generating word cloud...")
 wc = WordCloud(width=width, height=height,
                mask=mask,
                stopwords=stopwords,
                background_color=background_color,
                max_words=max_words,
                include_numbers=include_numbers,
                collocations=False,
                min_word_length=min_word_length,
                regexp=r"[A-Za-zÕÄÖÜõäöü0-9][\w\-'ÕÄÖÜõäöü]*(?<!\.ee)(?<!ee)",
                font_path=font_path)

 wc.generate(processed_text)

 # Get word frequencies from the word cloud
 word_frequencies = wc.process_text(processed_text)
 # Remove stopwords from the frequencies
 word_frequencies = {word: freq for word, freq in word_frequencies.items()
                     if word.lower() not in stopwords}

 # Sort words by frequency (highest first)
 sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)

 # Get top 10 words
 top_10_words = sorted_words[:10]

 # Print top 10 words to console
 print("\nTop 10 most frequent words:")
 for word, freq in top_10_words:
     print(f"{word}: {freq}")

 # Save top 10 words to a text file
 top_words_file = path.join(output_dir, 'top_words.txt')
 with open(top_words_file, 'w', encoding='utf-8') as f:
     f.write("Top 10 most frequent words:\n")
     for i, (word, freq) in enumerate(top_10_words, 1):
         f.write(f"{i}. {word}: {freq}\n")

 print(f"\nTop words saved to {top_words_file}")

 # store default colored image
 default_colors = wc.to_array()
 # Display the word cloud
 plt.imshow(wc.recolor(color_func=custom_color_func, random_state=3),
            interpolation="bilinear")
 plt.axis('off')
 plt.show()

 # Save the word cloud to file
 wc.to_file(path.join(output_dir, 'wordcloud.png'))

+# Call the async main function
+if __name__ == "__main__":
+    # Run the async main function
+    asyncio.run(main())
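The heart of the change is the fan-out pattern: compute the batch count by ceiling division, cap in-flight requests with a semaphore, and gather all batches concurrently. A stripped-down, runnable sketch of just that pattern, with the API request stubbed out:

```python
import asyncio

async def fake_request(batch):
    # Stand-in for the awaited OpenAI call; simulates network latency.
    await asyncio.sleep(0.1)
    return [f"word_{d.split('.')[0]}" for d in batch]

async def run_all(domains, batch_size=20, limit=8):
    # Ceiling division, as in the diff: 105 domains / 20 -> 6 batches.
    num_batches = (len(domains) + batch_size - 1) // batch_size
    semaphore = asyncio.Semaphore(limit)  # at most `limit` requests in flight

    async def process_batch(batch_idx):
        async with semaphore:
            batch = domains[batch_idx * batch_size:(batch_idx + 1) * batch_size]
            return await fake_request(batch)

    results = await asyncio.gather(*(process_batch(i) for i in range(num_batches)))
    return [word for batch in results for word in batch]

print(asyncio.run(run_all([f"domain{i}.ee" for i in range(105)])))
```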
lib/wordcloud/system_prompt.md (new file, 17 lines)

+You are a bilingual Estonian-English linguist and word segmentation expert.
+
+Your task is to identify which word or words a domain name consists of. You only work with English and Estonian words.
+
+**Key "Language"**:
+You must determine the language of the domain name. The domain name can be a single word or several words. You have 3 options: Estonian, English, Ignore.
+- If the domain consists of numbers, random letters, abbreviations, personal names, or is a transliteration from another language (for example, mnogoknig.ee from Russian), you should choose "Ignore" for Language.
+- If the domain consists of Estonian or English words, set the corresponding value.
+
+**Key "is_splitted":**
+Here you must specify whether the domain name consists of more than one word. Even if the domain includes an Estonian word and an abbreviation or a number, you still need to set "is_splitted" to true.
+
+**Key "reasoning":**
+Here you should reason about which exact words and abbreviations make up the domain name. If the "Language" key is set to Ignore, simply write Ignore. If the "Language" key is either Estonian or English, write a definition for each word, each abbreviation, and each symbol, explaining what they mean or could mean.
+
+**Key "words":**
+Based on the reasoning from the previous key, write only those words that make up the domain. For example, for auto24.ee, it would be "auto", "24". If the value was Ignore, leave the array empty.
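Put together with the json_schema response format above, a conforming model response for a batch containing auto24.ee might look like this (illustrative output, not taken from the commit; note the schema types is_splitted as a string):

```json
{
  "results": [
    {
      "Language": "Estonian",
      "is_splitted": "true",
      "reasoning": "auto is the Estonian word for car; 24 is a number suggesting round-the-clock availability.",
      "words": ["auto", "24"]
    }
  ]
}
```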
The regenerated top_words.txt reflects the new extraction:

@@ -1,11 +1,11 @@
 Top 10 most frequent words:
-1. tr: 4
-2. auto: 4
-3. 2-: 4
-4. faktor: 4
-5. e-: 2
-6. i-: 2
-7. digi: 2
-8. car: 2
-9. pood: 2
-10. ai: 1
+1. auto: 71
+2. eesti: 62
+3. 24: 60
+4. ehitus: 40
+5. shop: 33
+6. rent: 33
+7. pood: 28
+8. estonia: 26
+9. tartu: 24
+10. tech: 23
Binary file not shown (before: 351 KiB, after: 1.1 MiB).
Four wordcloud config snapshots were also committed, each containing the same single line. Note that the new script reads its batch size from OPENAI_BATCH_SIZE, so the batch_size, special_terms, and additional_prompt keys recorded here are no longer consumed:

public/wordcloud/wordcloud_config_1747745307.json (new file)
public/wordcloud/wordcloud_config_1747745435.json (new file)
public/wordcloud/wordcloud_config_1747831231.json (new file)
public/wordcloud/wordcloud_config_1747907076.json (new file)

+{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}