use ovhcloud

david, 1 month ago
commit 94354d57d3

1 changed file with 141 additions and 103 deletions:

main.py  +141 −103

@@ -1,29 +1,37 @@
+import os
+import re
+from openai import OpenAI, OpenAIError
+from dotenv import load_dotenv
 import requests
 from bs4 import BeautifulSoup
 import json
 import argparse
 from rich.console import Console
 from rich.markdown import Markdown
+from multiprocessing import Pool
+import multiprocessing as mp
+
 
 def duckduckgo_search(query, num_results=5):
     # Construct the DuckDuckGo URL for the search query
     url = f"https://html.duckduckgo.com/html/?q={query}"
-    
+
     # Send a GET request to the DuckDuckGo search page
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
-    
+
     response = requests.get(url, headers=headers)
-    
+
     # Check if the request was successful
     if response.status_code != 200:
-        print(f"Failed to retrieve search results. Status code: {response.status_code}")
+        print(
+            f"Failed to retrieve search results. Status code: {response.status_code}")
         return []
-    
+
     # Parse the HTML content using BeautifulSoup
     soup = BeautifulSoup(response.content, 'html.parser')
-    
+
     # Find all result links (assuming they are in <a> tags with class "result__a")
     result_links = []
     for a_tag in soup.find_all('a', class_='result__a'):
@@ -32,17 +40,17 @@ def duckduckgo_search(query, num_results=5):
             result_links.append(link)
             if len(result_links) >= num_results:
                 break
-    
+
     return result_links
 
 
-def extract_text_from_links(links, timeout=5):
+def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
     extracted_texts = []
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
-    
-    for link in links:        
+
+    for link in links:
         try:
             response = requests.get(link, headers=headers, timeout=timeout)
             if response.status_code == 200:
@@ -51,104 +59,123 @@ def extract_text_from_links(links, timeout=5):
                 text = soup.get_text(separator='\n', strip=True)
                 extracted_texts.append((link, text))
             else:
-                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
+                print(
+                    f"Failed to retrieve content from {link}. Status code: {response.status_code}")
         except requests.RequestException as e:
             print(f"An error occurred while fetching {link}: {e}")
-    
+
     return extracted_texts
 
+def remove_tags(text):
+    # Regular expression pattern to match '<think>' tags and their contents
+    pattern = r'<think>[\s\S]*?<\/think>\n\n'
+    # Replace all matches with an empty string
+    result = re.sub(pattern, '', text)
+    return result
 
-def summarize_individual_texts(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
-    summaries = []
-    for url, text in texts_and_urls:
-        prompt = f"Extract the relevant information from the following text with regards to the original \
-        query: '{query}'\n\n{text}\n"
-        payload = {
-            "model": model,
-            "prompt": prompt,
-            "stream": False,
-            "max_tokens": 1000,
-	    "options": {
-		"num_ctx": 16384
-	    }	
-        }
-        
-        try:
-            response = requests.post(api_url, json=payload)
-            if response.status_code == 200:
-                #result = json.loads(response.text)["response"]
-                result_json = json.loads(response.text)
-                print(result_json)
-                result = result_json["choices"][0]["text"].strip()                
-                summaries.append((url, result))
-            else:
-                print(f"Failed to get summary from server for {url}. Status code: {response.status_code}")
-        except requests.RequestException as e:
-            print(f"An error occurred while sending request to server for {url}: {e}")
+def process_url(args):
+    """Helper function to summarize one individual source."""
+    url, text, query, model, api_base, token = args
+
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+    following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}'\
+    and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not 
+    contain relevant information just return an empty response.
+    \n\n{text}"""
+
+    history = [{"role": "user", "content": prompt}]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=1000
+        ).choices[0].message.content
+        return (url, remove_tags(response))
+    except OpenAIError as e:
+        print(
+            f"An error occurred during summarization of {url}: {e}"
+        )
+        return (url, "")  # Return empty string on error
+
+
+def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
+    # Generate text summaries in parallel using multiprocessing
+    args_list = [(url, text, query, model, api_base, token) for url, text in texts_and_urls]
+
+    # Get number of CPUs to use
+    num_processes = mp.cpu_count()
+
+    # Create a process pool and process URLs in parallel
+    with Pool(processes=num_processes) as pool:
+        summaries = pool.map(process_url, args_list)
 
     return summaries
 
 
-def summarize(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
+def summarize(texts_and_urls, query, model, api_base, token):
     # Prepare the context and prompt
-    context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
-    prompt = f"Summarize the following search results with regards to the original query: '{query}' \
+    context = "\n".join(
+        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+        following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}' \
         and include the full URLs as references where appropriate. Use markdown to format your response. Add unicode characters where it makes sense to make the summary colorful. \
-        \n\n{context}"
-    
-    # Create the payload for the POST request
-    payload = {        
-        "model": model,
-        "prompt": prompt,
-        "stream": False,
-        # "max_tokens": 1500,
-        "options": {
-	    "num_ctx": 16384
-	}
-    }
-    
-    # Send the POST request to the server
+        \n\n{context}"""
+
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    history = [{"role": "user", "content": prompt}]
+
     try:
-        print("Processing")
-        response = requests.post(api_url, json=payload)
-        if response.status_code == 200:
-            #result = json.loads(response.text)["response"]
-            #return result            
-            result = json.loads(response.text)            
-            return(result["choices"][0]["text"].strip())
-        else:
-            print(f"Failed to get summary from the server. Status code: {response.status_code}")
-            return None
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=2000
+        ).choices[0].message.content
+        return remove_tags(response)
-    except requests.RequestException as e:
-        print(f"An error occurred while sending request to the server: {e}")
-        return None
+    except OpenAIError as e:
+        print(
+            f"An error occurred during summarization: {e}"
+        )
+        return None
 
 
-def optimize_search_query(query, query_model, api_url="http://localhost:8000/api/v1/completions"):
+def optimize_search_query(query, query_model, api_base):
     # Prepare the prompt for optimizing the search query
     prompt = f"Optimize the following natural language query to improve its effectiveness in a web search.\
         Make it very concise. Return only the optimized query text, no additional text, quotations, or thoughts. Query: '{query}'"
-    
+
     # Create the payload for the POST request
-    payload = {        
+    payload = {
         "model": query_model,
         "model": query_model,
         "prompt": prompt,
         "prompt": prompt,
         "stream": False,
         "stream": False,
         "max_tokens": 50
         "max_tokens": 50
     }
     }
-    
+
     # Send the POST request to the server
     try:
         print("Optimizing search query")
-        response = requests.post(api_url, json=payload)
+        response = requests.post(f"{api_base}completions", json=payload)  # api_base ends in /v1/
         if response.status_code == 200:
-            result = json.loads(response.text)            
-            return(result["choices"][0]["text"].strip())
+            result = json.loads(response.text)
+            return result["choices"][0]["text"].strip()
         else:
-            print(f"Failed to optimize search query from the server. Status code: {response.status_code}")
+            print(
+                f"Failed to optimize search query from the server. Status code: {response.status_code}")
             return query
     except requests.RequestException as e:
-        print(f"An error occurred while sending request to the server for optimizing the search query: {e}")
+        print(
+            f"An error occurred while sending request to the server for optimizing the search query: {e}")
         return query
 
 
@@ -156,43 +183,54 @@ def pretty_print_markdown(markdown_text):
     console = Console()
     md = Markdown(markdown_text)
     console.print(md)
-    
+
 
 if __name__ == "__main__":
+    load_dotenv()
+    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
+
     # Set up argument parser
-    parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
-    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
-    parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")
-    
+    parser = argparse.ArgumentParser(
+        description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
+    parser.add_argument("query", type=str,
+                        help="The search query to use on DuckDuckGo")
+    parser.add_argument("--num_results", type=int, default=5,
+                        help="Number of search results to process (default: 5)")
+
     # Parse arguments
     args = parser.parse_args()
-    
+
+    api_base = "https://oai.endpoints.kepler.ai.cloud.ovh.net/v1/"
     original_query = args.query
-    query_model = "Gemma-3-4b-it-GGUF"    
-    #query_model = "Qwen3-1.7B-GGUF"
-    #summary_model = "Qwen3-4B-Instruct-2507-GGUF"
-    summary_model = "Qwen3-1.7B-GGUF"
-    #final_summary_model = "Gemma-3-4b-it-GGUF"
-    final_summary_model = "Qwen3-1.7B-GGUF"
-
-    # Optimize the search query
-    optimized_query = optimize_search_query(original_query, query_model)
-    print(f"Original Query: {original_query}")
-    print(f"Optimized Query: {optimized_query}")
+    summary_model = "Qwen3-32B"
+
+    print(f"Query: {original_query}")
 
 
     n = args.num_results  # Number of results to extract
     n = args.num_results  # Number of results to extract
-    links = duckduckgo_search(optimized_query, n)
-    
+    links = duckduckgo_search(original_query, n)
+
     print(f"Top {n} search results:")
     print(f"Top {n} search results:")
     for i, link in enumerate(links, start=1):
     for i, link in enumerate(links, start=1):
         print(f"{i}. {link}")
         print(f"{i}. {link}")
-    
-    texts_and_urls = extract_text_from_links(links)    
-    
+
+    texts_and_urls = extract_text_from_links(links)
+
     print("Summarizing individual search results")
     print("Summarizing individual search results")
-    intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query, summary_model)
-    final_summary = summarize(intermediate_summaries, original_query, final_summary_model)
-    
+    intermediate_summaries = summarize_individual_texts(
+        texts_and_urls,
+        original_query,
+        summary_model,
+        api_base,
+        token
+    )
+
+    final_summary = summarize(
+        intermediate_summaries,
+        original_query,
+        summary_model,
+        api_base,
+        token)
+
     if final_summary:
-        print("\nFinal Summary of search results:\n")
+        print("\n################################# Final Summary of search results ################################# \n")        
         pretty_print_markdown(final_summary)
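
For reference, a minimal sketch of how the updated script is run. The environment variable name and the CLI arguments come from the code above; the .env location, the example query, and the result count are illustrative assumptions. The remove_tags() step exists because reasoning models such as Qwen3 may prepend <think>...</think> blocks to their replies, which the regex strips before display.

    # .env (read by load_dotenv() at startup); the token value is a placeholder
    OVH_AI_ENDPOINTS_ACCESS_TOKEN=<your-ovhcloud-token>

    # Search DuckDuckGo, fetch and summarize the top 3 results for an example query
    python main.py "how do transformer language models work" --num_results 3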