use ovhcloud

david, 1 month ago
commit 94354d57d3

1 changed file with 141 additions and 103 deletions:

main.py  +141 −103

@@ -1,29 +1,37 @@
+import os
+import re
+from openai import OpenAI, OpenAIError
+from dotenv import load_dotenv
 import requests
 from bs4 import BeautifulSoup
 import json
 import argparse
 from rich.console import Console
 from rich.markdown import Markdown
+from multiprocessing import Pool
+import multiprocessing as mp
+
 
 def duckduckgo_search(query, num_results=5):
     # Construct the DuckDuckGo URL for the search query
     url = f"https://html.duckduckgo.com/html/?q={query}"
-    
+
     # Send a GET request to the DuckDuckGo search page
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
-    
+
     response = requests.get(url, headers=headers)
-    
+
     # Check if the request was successful
     if response.status_code != 200:
-        print(f"Failed to retrieve search results. Status code: {response.status_code}")
+        print(
+            f"Failed to retrieve search results. Status code: {response.status_code}")
         return []
-    
+
     # Parse the HTML content using BeautifulSoup
     soup = BeautifulSoup(response.content, 'html.parser')
-    
+
     # Find all result links (assuming they are in <a> tags with class "result__a")
     result_links = []
     for a_tag in soup.find_all('a', class_='result__a'):
@@ -32,17 +40,17 @@ def duckduckgo_search(query, num_results=5):
             result_links.append(link)
             if len(result_links) >= num_results:
                 break
-    
+
     return result_links
 
 
-def extract_text_from_links(links, timeout=5):
+def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
     extracted_texts = []
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
-    
-    for link in links:        
+
+    for link in links:
         try:
             response = requests.get(link, headers=headers, timeout=timeout)
             if response.status_code == 200:
@@ -51,104 +59,123 @@ def extract_text_from_links(links, timeout=5):
                 text = soup.get_text(separator='\n', strip=True)
                 extracted_texts.append((link, text))
             else:
-                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
+                print(
+                    f"Failed to retrieve content from {link}. Status code: {response.status_code}")
         except requests.RequestException as e:
             print(f"An error occurred while fetching {link}: {e}")
-    
+
     return extracted_texts
 
+def remove_tags(text):
+    # Regular expression pattern to match '<think>' tags and their contents
+    pattern = r'<think>[\s\S]*?<\/think>\n\n'
+    # Replace all matches with an empty string
+    result = re.sub(pattern, '', text)
+    return result
 
-def summarize_individual_texts(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
-    summaries = []
-    for url, text in texts_and_urls:
-        prompt = f"Extract the relevant information from the following text with regards to the original \
-        query: '{query}'\n\n{text}\n"
-        payload = {
-            "model": model,
-            "prompt": prompt,
-            "stream": False,
-            "max_tokens": 1000,
-	    "options": {
-		"num_ctx": 16384
-	    }	
-        }
-        
-        try:
-            response = requests.post(api_url, json=payload)
-            if response.status_code == 200:
-                #result = json.loads(response.text)["response"]
-                result_json = json.loads(response.text)
-                print(result_json)
-                result = result_json["choices"][0]["text"].strip()                
-                summaries.append((url, result))
-            else:
-                print(f"Failed to get summary from server for {url}. Status code: {response.status_code}")
-        except requests.RequestException as e:
-            print(f"An error occurred while sending request to server for {url}: {e}")
+def process_url(args):
+    """Helper function to summarize one individual source."""
+    url, text, query, model, api_base, token = args
+
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+    following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}'\
+    and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not 
+    contain relevant information just return an empty response.
+    \n\n{text}"""
+
+    history = [{"role": "user", "content": prompt}]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=1000
+        ).choices[0].message.content
+        return (url, remove_tags(response))
+    except OpenAIError as e:
+        print(
+            f"An error occurred during summarization of {url}: {e}"
+        )
+        return (url, "")  # Return empty string on error
+
+
+def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
+    # Generate text summaries in parallel using multiprocessing
+    args_list = [(url, text, query, model, api_base, token) for url, text in texts_and_urls]
+
+    # Get number of CPUs to use
+    num_processes = mp.cpu_count()
+
+    # Create a process pool and process URLs in parallel
+    with Pool(processes=num_processes) as pool:
+        summaries = pool.map(process_url, args_list)
 
     return summaries
 
 
-def summarize(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
+def summarize(texts_and_urls, query, model, api_base, token):
     # Prepare the context and prompt
-    context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
-    prompt = f"Summarize the following search results with regards to the original query: '{query}' \
+    context = "\n".join(
+        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+        following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}' \
         and include the full URLs as references where appropriate. Use markdown to format your response. Add unicode characters where it makes sense to make the summary colorful. \
-        \n\n{context}"
-    
-    # Create the payload for the POST request
-    payload = {        
-        "model": model,
-        "prompt": prompt,
-        "stream": False,
-        # "max_tokens": 1500,
-        "options": {
-	    "num_ctx": 16384
-	}
-    }
-    
-    # Send the POST request to the server
+        \n\n{context}"""
+
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    history = [{"role": "user", "content": prompt}]
+
     try:
-        print("Processing")
-        response = requests.post(api_url, json=payload)
-        if response.status_code == 200:
-            #result = json.loads(response.text)["response"]
-            #return result            
-            result = json.loads(response.text)            
-            return(result["choices"][0]["text"].strip())
-        else:
-            print(f"Failed to get summary from the server. Status code: {response.status_code}")
-            return None
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=2000
+        ).choices[0].message.content
+        return remove_tags(response)
-    except requests.RequestException as e:
-        print(f"An error occurred while sending request to the server: {e}")
-        return None
+    except OpenAIError as e:
+        print(
+            f"An error occurred during summarization: {e}"
+        )
+        return None
 
 
-def optimize_search_query(query, query_model, api_url="http://localhost:8000/api/v1/completions"):
+def optimize_search_query(query, query_model, api_base):
     # Prepare the prompt for optimizing the search query
     prompt = f"Optimize the following natural language query to improve its effectiveness in a web search.\
         Make it very concise. Return only the optimized query text, no additional text, quotations, or thoughts. Query: '{query}'"
-    
+
     # Create the payload for the POST request
-    payload = {        
+    payload = {
         "model": query_model,
         "model": query_model,
         "prompt": prompt,
         "prompt": prompt,
         "stream": False,
         "stream": False,
         "max_tokens": 50
         "max_tokens": 50
     }
     }
-    
+
     # Send the POST request to the server
     try:
         print("Optimizing search query")
-        response = requests.post(api_url, json=payload)
+        response = requests.post(f"{api_base}completions", json=payload)  # api_base ends in /v1/
         if response.status_code == 200:
-            result = json.loads(response.text)            
-            return(result["choices"][0]["text"].strip())
+            result = json.loads(response.text)
+            return result["choices"][0]["text"].strip()
         else:
-            print(f"Failed to optimize search query from the server. Status code: {response.status_code}")
+            print(
+                f"Failed to optimize search query from the server. Status code: {response.status_code}")
             return query
     except requests.RequestException as e:
-        print(f"An error occurred while sending request to the server for optimizing the search query: {e}")
+        print(
+            f"An error occurred while sending request to the server for optimizing the search query: {e}")
         return query
 
 
@@ -156,43 +183,54 @@ def pretty_print_markdown(markdown_text):
     console = Console()
     md = Markdown(markdown_text)
     console.print(md)
-    
+
 
 if __name__ == "__main__":
+    load_dotenv()
+    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
+
     # Set up argument parser
-    parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
-    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
-    parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")
-    
+    parser = argparse.ArgumentParser(
+        description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
+    parser.add_argument("query", type=str,
+                        help="The search query to use on DuckDuckGo")
+    parser.add_argument("--num_results", type=int, default=5,
+                        help="Number of search results to process (default: 5)")
+
     # Parse arguments
     args = parser.parse_args()
-    
+
+    api_base = "https://oai.endpoints.kepler.ai.cloud.ovh.net/v1/"
     original_query = args.query
-    query_model = "Gemma-3-4b-it-GGUF"    
-    #query_model = "Qwen3-1.7B-GGUF"
-    #summary_model = "Qwen3-4B-Instruct-2507-GGUF"
-    summary_model = "Qwen3-1.7B-GGUF"
-    #final_summary_model = "Gemma-3-4b-it-GGUF"
-    final_summary_model = "Qwen3-1.7B-GGUF"
-
-    # Optimize the search query
-    optimized_query = optimize_search_query(original_query, query_model)
-    print(f"Original Query: {original_query}")
-    print(f"Optimized Query: {optimized_query}")
+    summary_model = "Qwen3-32B"
+
+    print(f"Query: {original_query}")
 
 
     n = args.num_results  # Number of results to extract
     n = args.num_results  # Number of results to extract
-    links = duckduckgo_search(optimized_query, n)
-    
+    links = duckduckgo_search(original_query, n)
+
     print(f"Top {n} search results:")
     print(f"Top {n} search results:")
     for i, link in enumerate(links, start=1):
     for i, link in enumerate(links, start=1):
         print(f"{i}. {link}")
         print(f"{i}. {link}")
-    
-    texts_and_urls = extract_text_from_links(links)    
-    
+
+    texts_and_urls = extract_text_from_links(links)
+
     print("Summarizing individual search results")
     print("Summarizing individual search results")
-    intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query, summary_model)
-    final_summary = summarize(intermediate_summaries, original_query, final_summary_model)
-    
+    intermediate_summaries = summarize_individual_texts(
+        texts_and_urls,
+        original_query,
+        summary_model,
+        api_base,
+        token
+    )
+
+    final_summary = summarize(
+        intermediate_summaries,
+        original_query,
+        summary_model,
+        api_base,
+        token)
+
     if final_summary:
-        print("\nFinal Summary of search results:\n")
+        print("\n################################# Final Summary of search results ################################# \n")        
         pretty_print_markdown(final_summary)
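
For reference, a minimal sketch of how the updated script is run. The environment variable name and the CLI arguments come from the code above; the .env location, the example query, and the result count are illustrative assumptions. The remove_tags() step exists because reasoning models such as Qwen3 may prepend <think>...</think> blocks to their replies, which the regex strips before display.

    # .env (read by load_dotenv() at startup); the token value is a placeholder
    OVH_AI_ENDPOINTS_ACCESS_TOKEN=<your-ovhcloud-token>

    # Search DuckDuckGo, fetch and summarize the top 3 results for an example query
    python main.py "how do transformer language models work" --num_results 3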