il y a 3 mois · 9f9594b185
--- a/main.py
+++ b/main.py
@@ -1,236 +0,0 @@
 
				-import os
			
 
				-import re
			
 
				-from openai import OpenAI
			
 
				-from dotenv import load_dotenv
			
 
				-import requests
			
 
				-from bs4 import BeautifulSoup
			
 
				-import json
			
 
				-import argparse
			
 
				-from rich.console import Console
			
 
				-from rich.markdown import Markdown
			
 
				-from multiprocessing import Pool
			
 
				-import multiprocessing as mp
			
 
				-
			
 
				-
			
 
				-def duckduckgo_search(query, num_results=5):
			
 
				-    # Construct the DuckDuckGo URL for the search query
			
 
				-    url = f"https://html.duckduckgo.com/html/?q={query}"
			
 
				-
			
 
				-    # Send a GET request to the DuckDuckGo search page
			
 
				-    headers = {
			
 
				-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
			
 
				-    }
			
 
				-
			
 
				-    response = requests.get(url, headers=headers)
			
 
				-
			
 
				-    # Check if the request was successful
			
 
				-    if response.status_code != 200:
			
 
				-        print(
			
 
				-            f"Failed to retrieve search results. Status code: {response.status_code}")
			
 
				-        return []
			
 
				-
			
 
				-    # Parse the HTML content using BeautifulSoup
			
 
				-    soup = BeautifulSoup(response.content, 'html.parser')
			
 
				-
			
 
				-    # Find all result links (assuming they are in <a> tags with class "result__a")
			
 
				-    result_links = []
			
 
				-    for a_tag in soup.find_all('a', class_='result__a'):
			
 
				-        link = a_tag.get('href')
			
 
				-        if link:
			
 
				-            result_links.append(link)
			
 
				-            if len(result_links) >= num_results:
			
 
				-                break
			
 
				-
			
 
				-    return result_links
			
 
				-
			
 
				-
			
 
				-def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
			
 
				-    extracted_texts = []
			
 
				-    headers = {
			
 
				-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
			
 
				-    }
			
 
				-
			
 
				-    for link in links:
			
 
				-        try:
			
 
				-            response = requests.get(link, headers=headers, timeout=timeout)
			
 
				-            if response.status_code == 200:
			
 
				-                soup = BeautifulSoup(response.content, 'html.parser')
			
 
				-                # Extract text from the page
			
 
				-                text = soup.get_text(separator='\n', strip=True)
			
 
				-                extracted_texts.append((link, text))
			
 
				-            else:
			
 
				-                print(
			
 
				-                    f"Failed to retrieve content from {link}. Status code: {response.status_code}")
			
 
				-        except requests.RequestException as e:
			
 
				-            print(f"An error occurred while fetching {link}: {e}")
			
 
				-
			
 
				-    return extracted_texts
			
 
				-
			
 
				-def remove_tags(text):        
			
 
				-    # Regular expression pattern to match '<think>' tags and their contents
			
 
				-    pattern = r'<think>[\s\S]*?<\/think>\n\n'
			
 
				-    # Replace all matches with an empty string
			
 
				-    result = re.sub(pattern, '', text)
			
 
				-    return result
			
 
				-
			
 
				-def process_url(args):
			
 
				-    "Helper function to summarize one individual source"
			
 
				-    url, text, query, model, api_base, token = args
			
 
				-    
			
 
				-    client = OpenAI(
			
 
				-        base_url=api_base,
			
 
				-        api_key=token
			
 
				-    )
			
 
				-
			
 
				-    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
			
 
				-    following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}'\
			
 
				-    and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not 
			
 
				-    contain relevant information just return an empty response.
			
 
				-    \n\n{text}"""
			
 
				-
			
 
				-    history = [{"role": "user", "content": prompt}]
			
 
				-
			
 
				-    try:
			
 
				-        response = client.chat.completions.create(
			
 
				-            model=model,
			
 
				-            messages=history,
			
 
				-            temperature=0,
			
 
				-            max_tokens=1000
			
 
				-        ).choices.pop().message.content        
			
 
				-        return (url, remove_tags(response))
			
 
				-    except Exception as e:
			
 
				-        print(
			
 
				-            f"An error occurred at summarization for {url}: {e}"
			
 
				-        )
			
 
				-        return (url, "")  # Return empty string on error
			
 
				-
			
 
				-
			
 
				-def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
			
 
				-    # Generate text summaries in parallel using multiprocessing
			
 
				-    args_list = [(url, text, query, model, api_base, token) for url, text in texts_and_urls]
			
 
				-    
			
 
				-    # Get number of CPUs to use
			
 
				-    num_processes = mp.cpu_count()
			
 
				-
			
 
				-    # Create a process pool and process URLs in parallel
			
 
				-    with Pool(processes=num_processes) as pool:
			
 
				-        summaries = pool.map(process_url, args_list)
			
 
				-    
			
 
				-    return summaries
			
 
				-
			
 
				-
			
 
				-def summarize(texts_and_urls, query, model, api_base, token):
			
 
				-    # Prepare the context and prompt
			
 
				-    context = "\n".join(
			
 
				-        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
			
 
				-    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
			
 
				-        following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}' \
			
 
				-        and include the full URLs as references where appropriate. Use markdown to format your response. Add unicode characters where it makes sense to make the summary colorful. \
			
 
				-        \n\n{context}"""
			
 
				-
			
 
				-    client = OpenAI(
			
 
				-        base_url=api_base,
			
 
				-        api_key=token
			
 
				-    )
			
 
				-
			
 
				-    history = [{"role": "user", "content": prompt}]
			
 
				-
			
 
				-    try:
			
 
				-        response = client.chat.completions.create(
			
 
				-            model=model,
			
 
				-            messages=history,
			
 
				-            temperature=0,
			
 
				-            max_tokens=2000
			
 
				-        ).choices.pop().message.content
			
 
				-        return remove_tags(response)
			
 
				-    except Exception as e:
			
 
				-        print(
			
 
				-            f"An error occurred at summarization for {url}: {e}"
			
 
				-        )
			
 
				-
			
 
				-
			
 
				-def optimize_search_query(query, query_model, api_base):
			
 
				-    # Prepare the prompt for optimizing the search query
			
 
				-    prompt = f"Optimize the following natural language query to improve its effectiveness in a web search.\
			
 
				-        Make it very concise. Return only exactly the optimized query text no additional texts, quotations or thoughts. Query: '{query}'"
			
 
				-
			
 
				-    # Create the payload for the POST request
			
 
				-    payload = {
			
 
				-        "model": query_model,
			
 
				-        "prompt": prompt,
			
 
				-        "stream": False,
			
 
				-        "max_tokens": 50
			
 
				-    }
			
 
				-
			
 
				-    # Send the POST request to the server
			
 
				-    try:
			
 
				-        print("Optimizing search query")
			
 
				-        response = requests.post(api_base, json=payload)
			
 
				-        if response.status_code == 200:
			
 
				-            result = json.loads(response.text)
			
 
				-            return (result["choices"][0]["text"].strip())
			
 
				-        else:
			
 
				-            print(
			
 
				-                f"Failed to optimize search query from the server. Status code: {response.status_code}")
			
 
				-            return query
			
 
				-    except Exception as e:
			
 
				-        print(
			
 
				-            f"An error occurred while sending request to the server for optimizing the search query: {e}")
			
 
				-        return query
			
 
				-
			
 
				-
			
 
				-def pretty_print_markdown(markdown_text):
			
 
				-    console = Console()
			
 
				-    md = Markdown(markdown_text)
			
 
				-    console.print(md)
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    load_dotenv()
			
 
				-    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
			
 
				-    api_base = os.getenv("API_BASE")    
			
 
				-    summary_model = os.getenv("SUMMARY_MODEL")
			
 
				-
			
 
				-    # Set up argument parser
			
 
				-    parser = argparse.ArgumentParser(
			
 
				-        description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
			
 
				-    parser.add_argument("query", type=str,
			
 
				-                        help="The search query to use on DuckDuckGo")
			
 
				-    parser.add_argument("--num_results", type=int, default=5,
			
 
				-                        help="Number of search results to process (default: 5)")
			
 
				-
			
 
				-    # Parse arguments
			
 
				-    args = parser.parse_args()
			
 
				-
			
 
				-    
			
 
				-    original_query = args.query
			
 
				-    
			
 
				-    print(f"Query: {original_query}")
			
 
				-
			
 
				-    n = args.num_results  # Number of results to extract
			
 
				-    links = duckduckgo_search(original_query, n)
			
 
				-
			
 
				-    print(f"Top {n} search results:")
			
 
				-    for i, link in enumerate(links, start=1):
			
 
				-        print(f"{i}. {link}")
			
 
				-
			
 
				-    texts_and_urls = extract_text_from_links(links)
			
 
				-
			
 
				-    print("Summarizing individual search results")
			
 
				-    intermediate_summaries = summarize_individual_texts(
			
 
				-        texts_and_urls,
			
 
				-        original_query,
			
 
				-        summary_model,
			
 
				-        api_base,
			
 
				-        token
			
 
				-    )
			
 
				-
			
 
				-    final_summary = summarize(
			
 
				-        intermediate_summaries,
			
 
				-        original_query,
			
 
				-        summary_model,
			
 
				-        api_base,
			
 
				-        token)
			
 
				-
			
 
				-    if final_summary:
			
 
				-        pretty_print_markdown(final_summary)