implement async requests to AI model and structured output

oleghasjanov 2025-05-27 15:32:16 +03:00
parent 5b6888eb43
commit aa4d36a0ad
10 changed files with 304 additions and 194 deletions

View file

@@ -62,6 +62,16 @@ RUN apt-get install -y --no-install-recommends > /dev/null \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

+# Install Python packages for wordcloud generation
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3-pip \
+    python3-setuptools \
+    python3-dev \
+    && pip3 install --upgrade pip setuptools wheel \
+    && pip3 install --no-cache-dir numpy Pillow matplotlib wordcloud openai dotenv \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
RUN apt-get autoremove -y && apt-get clean

ENV CHROME_VERSION="128.0.6613.137"
@@ -95,7 +105,6 @@ ENV PATH="/opt/chrome-linux64:${PATH}"
RUN ln -s /lib/ld-linux.so.2 /lib/ld-linux.so.2 || true

-# Wrapper for wkhtmltopdf with xvfb
RUN echo '#!/bin/bash\nxvfb-run -a --server-args="-screen 0, 1024x768x24" /usr/bin/wkhtmltopdf "$@"' > /usr/local/bin/wkhtmltopdf \
    && chmod +x /usr/local/bin/wkhtmltopdf
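Since this layer bakes the Python stack into the image, a quick import smoke test can catch packaging problems at build time. One caveat worth verifying: the wordcloud script below does `from dotenv import load_dotenv`, which is usually provided by the `python-dotenv` distribution, while this layer installs a package named `dotenv`. A minimal sketch (file name and invocation are assumptions, not part of the commit):

# smoke_test.py - hypothetical check that the pip layer above is importable;
# run inside the built image, e.g. `docker run <image> python3 smoke_test.py`.
import importlib

# Module names to import (note distribution vs module names: Pillow -> PIL).
for module in ("numpy", "PIL", "matplotlib", "wordcloud", "openai", "dotenv"):
    importlib.import_module(module)
    print(f"{module}: OK")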

View file

@@ -5,6 +5,9 @@ require 'open3'
# using an external Python script with progress tracking
class GenerateWordCloudJob < ApplicationJob
  def perform(domains_file_path, user_id, config = {})
+    Rails.logger.info("Generating wordcloud for #{domains_file_path}")
    @domains_file_path = domains_file_path
    @user_id = user_id
    @config = config
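The job's comment says the Python script runs through `open3` with progress tracking, which is why the script calls `sys.stdout.flush()` after printing each batch line. A rough sketch of consuming that line-buffered output (written in Python for consistency with the rest of the commit; the real supervisor is this Ruby job, and the script and file names here are assumptions):

# Hypothetical supervisor: stream "Processing batch i/n ..." lines as progress.
import subprocess

proc = subprocess.Popen(
    ["python3", "wordcloud.py", "domains.txt", "./output"],  # assumed names
    stdout=subprocess.PIPE,
    text=True,
    bufsize=1,  # line-buffered reads; pairs with the script's stdout.flush()
)
for line in proc.stdout:
    if line.startswith("Processing batch"):
        print("progress:", line.strip())
proc.wait()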

View file

@@ -5,19 +5,42 @@ import re
import sys
import json
import random
+import asyncio
import numpy as np
from PIL import Image
from os import path
from wordcloud import WordCloud, STOPWORDS
-import openai
+from openai import AsyncOpenAI
import matplotlib.pyplot as plt
-# import pandas as pd
from dotenv import load_dotenv

load_dotenv()

+BATCH_SIZE = int(os.environ.get("OPENAI_BATCH_SIZE", "20"))
+
+def load_system_prompt():
+    """Loads system prompt from system_prompt.md file"""
+    prompt_file = path.join(path.dirname(__file__), 'system_prompt.md')
+    if not path.exists(prompt_file):
+        raise FileNotFoundError(f"System prompt not found at {prompt_file}. Please create the file.")
+    with open(prompt_file, 'r', encoding='utf-8') as f:
+        system_prompt = f.read()
+    return system_prompt
+
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
output_dir = sys.argv[2] if len(sys.argv) > 2 else d

+try:
+    SYSTEM_PROMPT = load_system_prompt()
+    print("System prompt successfully loaded from file.")
+except FileNotFoundError as e:
+    print(f"Error: {e}")
+    sys.exit(1)
+
# Load configuration if provided
config = {}
if len(sys.argv) > 3 and sys.argv[3]:
@@ -45,29 +68,31 @@ if not domain_names:
    print("Error: No domain names found in the provided file")
    sys.exit(1)

-# Get special terms from config or use defaults
-SPECIAL_TERMS = config.get('special_terms', ['e-', 'i-', '2-', '3-', '4-', '.com', 'tr.ee', 'ai', 'web'])
-print(f"Using special terms: {SPECIAL_TERMS}")
-
-# Get batch size from config or use default
-BATCH_SIZE = int(config.get('batch_size', 500))
-print(f"Using batch size: {BATCH_SIZE}")
-
-# Get additional prompt from config or use default
-ADDITIONAL_PROMPT = config.get('additional_prompt', None)
-print(f"Using additional prompt: {ADDITIONAL_PROMPT}")
-
-# Function to extract words using OpenAI API
-def extract_words_with_openai(domain_names, special_terms, batch_size=BATCH_SIZE, additional_prompt=ADDITIONAL_PROMPT):
+# Function to extract words using OpenAI API asynchronously
+async def extract_words_with_openai(domain_names, batch_size=BATCH_SIZE):
+    filtered_domains = []
+    # Filter out domains that are only numbers
+    for domain in domain_names:
+        domain_core = domain.lower().replace('www.', '')
+        main_part = domain_core.split('.')[0]
+        if not main_part.isdigit():
+            filtered_domains.append(domain)
+
    # Get API key from environment variable
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OpenAI API key not found. Set the OPENAI_API_KEY environment variable.")

+    # Initialize AsyncOpenAI client
+    client = AsyncOpenAI(api_key=api_key)
+
    # Get model and temperature from environment variables
    model = os.environ.get("OPENAI_MODEL", "gpt-4.1-2025-04-14")
-    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0.3"))
-    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "2000"))
+    temperature = float(os.environ.get("OPENAI_TEMPERATURE", "0"))
+    max_tokens = int(os.environ.get("OPENAI_MAX_TOKENS", "16000"))

    # Process domains in batches
    all_words = []
@@ -76,49 +101,86 @@ def extract_words_with_openai(domain_names, special_terms, batch_size=BATCH_SIZE
    total_cost = 0

    # Calculate number of batches
-    num_batches = (len(domain_names) + batch_size - 1) // batch_size
-    for i in range(0, len(domain_names), batch_size):
-        batch = domain_names[i:i+batch_size]
-        print(f"Processing batch {i//batch_size + 1}/{num_batches} ({len(batch)} domains)...")
+    num_batches = (len(filtered_domains) + batch_size - 1) // batch_size
+
+    # Create semaphore to limit concurrent requests
+    semaphore = asyncio.Semaphore(8)  # Limit to 8 concurrent requests
+
+    async def process_batch(batch_idx):
+        async with semaphore:
+            start_idx = batch_idx * batch_size
+            end_idx = min(start_idx + batch_size, len(filtered_domains))
+            batch = filtered_domains[start_idx:end_idx]
+            print(f"Processing batch {batch_idx + 1}/{num_batches} ({len(batch)} domains)...")
            sys.stdout.flush()

            # Prepare the prompt with domain names and special terms
            domains_text = "\n".join(batch)
-            special_terms_text = ", ".join([f"`{term}`" for term in special_terms])
-            prompt = f"""You are a bilingual Estonian-English linguist and word segmentation expert. I will give you a list of .ee domain names.
-Your task is to extract a clean list of words for word cloud generation.
-Follow these rules strictly:
-1. Before doing anything else, always extract and separate these predefined special terms if they appear as prefixes or parts of the domain name: {special_terms_text}. Keep symbols and numbers as they are. For example, if the domain name is `e-robot.ee`, the output should be `e- robot`. Remove extensions from the special terms.
-2. If a word contains a number (e.g., `auto24`), separate the number and the word: `auto`, `24`.
-3. If the domain name is a compound of 2+ Estonian or English words (e.g., `virtuaalabiline` or `doorkeeper`), intelligently split them into individual meaningful components. Prioritize Estonian words over English words.
-4. Keep all resulting words in lowercase and remove the `.ee` extension from all the words.
-5. Try to find the most common words and phrases in the domain names.
-6. Return ONLY a space-separated list of words and numbers with no explanations, no formatting, no introductions, and no additional text.
-{additional_prompt}
-Example output format:
-word1 word2 word3 word4 word5
-Here are the domain names:
-{domains_text}
-"""
+            prompt = f"List of domain names: {domains_text}"

            # Make the API call
            try:
                print(f"Using model: {model} with temperature: {temperature}")
-                response = openai.chat.completions.create(
+                response = await client.chat.completions.create(
                    model=model,
                    messages=[
-                        {"role": "system", "content": "You are a helpful assistant that extracts words from domain names. You ONLY output the extracted words with no additional text."},
+                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
+                    response_format={
+                        "type": "json_schema",
+                        "json_schema": {
+                            "name": "domain_analysis_results",
+                            "strict": True,
+                            "schema": {
+                                "type": "object",
+                                "properties": {
+                                    "results": {
+                                        "type": "array",
+                                        "description": "A list of analysis results for the provided domains.",
+                                        "items": {
+                                            "type": "object",
+                                            "properties": {
+                                                "Language": {
+                                                    "type": "string",
+                                                    "description": "The language identified in the domain name."
+                                                },
+                                                "is_splitted": {
+                                                    "type": "string",
+                                                    "description": "Indicates whether the domain name is split into recognizable words."
+                                                },
+                                                "reasoning": {
+                                                    "type": "string",
+                                                    "description": "Explanation of the reasoning behind the language and word identification."
+                                                },
+                                                "words": {
+                                                    "type": "array",
+                                                    "description": "The words identified in the domain name.",
+                                                    "items": {
+                                                        "type": "string"
+                                                    }
+                                                }
+                                            },
+                                            "required": [
+                                                "Language",
+                                                "is_splitted",
+                                                "reasoning",
+                                                "words"
+                                            ],
+                                            "additionalProperties": False
+                                        }
+                                    }
+                                },
+                                "required": [
+                                    "results"
+                                ],
+                                "additionalProperties": False
+                            }
+                        }
+                    },
                    temperature=temperature,
-                    max_tokens=max_tokens
+                    max_tokens=max_tokens,
                )

                # Track token usage
@@ -126,6 +188,7 @@ Here are the domain names:
                completion_tokens = response.usage.completion_tokens
                total_tokens = response.usage.total_tokens

+                nonlocal total_prompt_tokens, total_completion_tokens
                total_prompt_tokens += prompt_tokens
                total_completion_tokens += completion_tokens
@@ -140,42 +203,51 @@ Here are the domain names:
                    completion_cost = 0

                batch_cost = prompt_cost + completion_cost
+                nonlocal total_cost
                total_cost += batch_cost

                print(f"Estimated batch cost: ${batch_cost:.6f}")

                # Extract the words from the response
-                words_text = response.choices[0].message.content.strip()
-
-                # Process the response to get a clean list of words
+                response_json = json.loads(response.choices[0].message.content)
                batch_words = []
-                for line in words_text.split('\n'):
-                    line = line.strip()
-                    if line and not line.startswith('```') and not line.endswith('```'):
-                        # Remove any list markers like "1. ", "- ", etc.
-                        cleaned_line = re.sub(r'^[\d\-\*\\.\s]+', '', line)
-                        if cleaned_line:
-                            batch_words.extend(cleaned_line.split())
-                all_words.extend(batch_words)
+                for result in response_json['results']:
+                    if result['Language'] == 'Ignore':
+                        continue
+                    batch_words.extend(result['words'])

                print(f"Extracted {len(batch_words)} words from this batch")
+                return batch_words
            except Exception as e:
                print(f"Error calling OpenAI API for batch: {e}")
+                return []

+    # Create tasks for each batch
+    tasks = []
+    for batch_idx in range(num_batches):
+        tasks.append(process_batch(batch_idx))
+
+    # Run all tasks concurrently and wait for results
+    batch_results = await asyncio.gather(*tasks)
+
+    # Combine all words from all batches
+    for batch_words in batch_results:
+        all_words.extend(batch_words)
+
    print(f"Total token usage - Prompt: {total_prompt_tokens}, Completion: {total_completion_tokens}")
    print(f"Total estimated cost: ${total_cost:.6f}")

    return all_words
+# Replace the synchronous call with an async function
+async def main():
    # Process domain names using OpenAI
    print("Extracting words from domain names using OpenAI...")
-    extracted_words = extract_words_with_openai(domain_names, SPECIAL_TERMS)
+    extracted_words = await extract_words_with_openai(domain_names)
    print(f"Extracted {len(extracted_words)} words")
-    # print("Sample of extracted words:", extracted_words)

    # Join the extracted words for the word cloud
    processed_text = ' '.join(extracted_words)
-    # print("Processed text sample:", processed_text)

def custom_color_func(word, font_size, position, orientation, random_state=None,
                      **kwargs):
@@ -272,3 +344,8 @@ plt.show()
# Save the word cloud to file
wc.to_file(path.join(output_dir, 'wordcloud.png'))

+# Call the async main function
+if __name__ == "__main__":
+    # Run the async main function
+    asyncio.run(main())
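The core of this change is the bounded-concurrency pattern: each batch becomes a coroutine, a semaphore caps how many API calls run at once, and asyncio.gather collects the per-batch word lists. A self-contained sketch of the same pattern, with a sleep standing in for the OpenAI request (all names here are illustrative, not from the commit):

import asyncio

async def gather_batches(items, batch_size=20, max_concurrency=8):
    semaphore = asyncio.Semaphore(max_concurrency)
    num_batches = (len(items) + batch_size - 1) // batch_size

    async def process_batch(idx):
        async with semaphore:  # at most max_concurrency batches in flight
            batch = items[idx * batch_size:(idx + 1) * batch_size]
            await asyncio.sleep(0.01)  # stand-in for the API request
            return [item.upper() for item in batch]  # stand-in for extracted words

    results = await asyncio.gather(*(process_batch(i) for i in range(num_batches)))
    return [word for batch in results for word in batch]

if __name__ == "__main__":
    print(asyncio.run(gather_batches([f"domain{i}" for i in range(50)]))[:5])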

View file

@@ -0,0 +1,17 @@
You are a bilingual Estonian-English linguist and word segmentation expert.

Your task is to identify which word or words a domain name consists of. You work only with English and Estonian words.

**Key "Language":**
You must determine the language of the domain name. The domain name can be a single word or several words. You have 3 options: Estonian, English, Ignore.
- If the domain consists of numbers, random letters, abbreviations, personal names, or is a transliteration from another language (for example, mnogoknig.ee from Russian), you should choose "Ignore" for Language.
- If the domain consists of Estonian or English words, set the corresponding value.

**Key "is_splitted":**
Here you must specify whether the domain name consists of more than one word. Even if the domain includes an Estonian word and an abbreviation or a number, you still need to set "is_splitted" to true.

**Key "reasoning":**
Here you should reason about which exact words and abbreviations make up the domain name. If the "Language" key is set to Ignore, simply write Ignore. If the "Language" key is either Estonian or English, then write a definition for each word, each abbreviation, and each symbol, explaining what it means or could mean.

**Key "words":**
Based on the reasoning from the previous key, you must write only the words that make up the domain. For example, for auto24.ee, it would be "auto", "24". If the value was Ignore, leave the array empty.
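Combined with the "domain_analysis_results" schema in the script, a batch response parses into something like the following (field values invented for illustration); the extraction loop then keeps only the `words` of results whose `Language` is not `Ignore`:

# Hypothetical parsed response matching the structured-output schema.
response_json = {
    "results": [
        {
            "Language": "Estonian",
            "is_splitted": "true",
            "reasoning": "auto24.ee: 'auto' (car) plus the number '24'.",
            "words": ["auto", "24"],
        },
        {
            "Language": "Ignore",
            "is_splitted": "false",
            "reasoning": "Ignore",
            "words": [],
        },
    ]
}

# Same filtering rule the script applies per batch.
words = [
    word
    for result in response_json["results"]
    if result["Language"] != "Ignore"
    for word in result["words"]
]
print(words)  # ['auto', '24']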

View file

@@ -1,11 +1,11 @@
Top 10 most frequent words:
-1. tr: 4
-2. auto: 4
-3. 2-: 4
-4. faktor: 4
-5. e-: 2
-6. i-: 2
-7. digi: 2
-8. car: 2
-9. pood: 2
-10. ai: 1
+1. auto: 71
+2. eesti: 62
+3. 24: 60
+4. ehitus: 40
+5. shop: 33
+6. rent: 33
+7. pood: 28
+8. estonia: 26
+9. tartu: 24
+10. tech: 23
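The commit does not show how this report is produced; a frequency list like it could come from a simple Counter over the extracted words (a sketch under that assumption, with sample data, not the actual reporting code):

from collections import Counter

extracted_words = ["auto", "24", "eesti", "auto", "ehitus", "auto"]  # sample data
for rank, (word, count) in enumerate(Counter(extracted_words).most_common(10), start=1):
    print(f"{rank}. {word}: {count}")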

Binary file not shown.

Before: Size: 351 KiB
After: Size: 1.1 MiB

View file

@@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}

View file

@@ -0,0 +1 @@
{"width":"800","height":"800","max_words":"500","background_color":"white","min_word_length":"2","include_numbers":true,"batch_size":"500","additional_prompt":null,"special_terms":["e-","i-","2-","3-","4-",".com","tr.ee","ai","web"]}