6 commits 433840d9da ... e7fccaf8fb

Author  SHA1  Message  Commit date
  david e7fccaf8fb multiple changes 3 weeks ago
  david b699220755 use it 1 month ago
  david 94354d57d3 use ovhcloud 1 month ago
  david b6dd239eaf test various models 1 month ago
  david dec7c7620e update api endpoint and response format 1 month ago
  david 0961792d5e add gitignore 1 month ago
2 files changed, 145 insertions and 98 deletions
  1. .gitignore  +2 -0
  2. main.py  +143 -98

.gitignore  +2 -0

@@ -0,0 +1,2 @@
+.vscode/
+.env

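Note: the new .gitignore entry excludes the .env file that the updated main.py loads via python-dotenv. A minimal sketch of that file, assuming only the variable names read in main.py below (the token, endpoint URL, and model name are placeholders, not values from this change):

    OVH_AI_ENDPOINTS_ACCESS_TOKEN=your-ovhcloud-access-token
    API_BASE=https://your-ai-endpoint.example.com/v1
    SUMMARY_MODEL=your-model-name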
main.py  +143 -98

@@ -1,29 +1,37 @@
+import os
+import re
+from openai import OpenAI
+from dotenv import load_dotenv
 import requests
 from bs4 import BeautifulSoup
 import json
 import argparse
 from rich.console import Console
 from rich.markdown import Markdown
+from multiprocessing import Pool
+import multiprocessing as mp
+
 
 def duckduckgo_search(query, num_results=5):
     # Construct the DuckDuckGo URL for the search query
     url = f"https://html.duckduckgo.com/html/?q={query}"
-    
+
     # Send a GET request to the DuckDuckGo search page
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
-    
+
     response = requests.get(url, headers=headers)
-    
+
     # Check if the request was successful
     if response.status_code != 200:
-        print(f"Failed to retrieve search results. Status code: {response.status_code}")
+        print(
+            f"Failed to retrieve search results. Status code: {response.status_code}")
         return []
-    
+
     # Parse the HTML content using BeautifulSoup
     soup = BeautifulSoup(response.content, 'html.parser')
-    
+
     # Find all result links (assuming they are in <a> tags with class "result__a")
     result_links = []
     for a_tag in soup.find_all('a', class_='result__a'):
@@ -32,17 +40,17 @@ def duckduckgo_search(query, num_results=5):
             result_links.append(link)
             if len(result_links) >= num_results:
                 break
-    
+
     return result_links
 
 
-def extract_text_from_links(links, timeout=5):
+def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
     extracted_texts = []
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
     }
-    
-    for link in links:        
+
+    for link in links:
         try:
             response = requests.get(link, headers=headers, timeout=timeout)
             if response.status_code == 200:
@@ -51,99 +59,123 @@ def extract_text_from_links(links, timeout=5):
                 text = soup.get_text(separator='\n', strip=True)
                 extracted_texts.append((link, text))
             else:
-                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
+                print(
+                    f"Failed to retrieve content from {link}. Status code: {response.status_code}")
         except requests.RequestException as e:
             print(f"An error occurred while fetching {link}: {e}")
-    
+
     return extracted_texts
 
+def remove_tags(text):        
+    # Regular expression pattern to match '<think>' tags and their contents
+    pattern = r'<think>[\s\S]*?<\/think>\n\n'
+    # Replace all matches with an empty string
+    result = re.sub(pattern, '', text)
+    return result
 
-def summarize_individual_texts(texts_and_urls, query, model, ollama_url="http://localhost:11434/api/generate"):
-    summaries = []
-    for url, text in texts_and_urls:
-        prompt = f"Extract the relevant information from the following text with regards to the original \
-        query: '{query}'\n\n{text}\n"
-        payload = {
-            "model": model,
-            "prompt": prompt,
-            "stream": False,
-            "max_tokens": 1000,
-	    "options": {
-		"num_ctx": 16384
-	    }	
-        }
-        
-        try:
-            response = requests.post(ollama_url, json=payload)
-            if response.status_code == 200:
-                result = json.loads(response.text)["response"]
-                summaries.append((url, result))
-            else:
-                print(f"Failed to get summary from Ollama server for {url}. Status code: {response.status_code}")
-        except requests.RequestException as e:
-            print(f"An error occurred while sending request to Ollama server for {url}: {e}")
+def process_url(args):
+    "Helper function to summarize one individual source"
+    url, text, query, model, api_base, token = args
+    
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+    following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}'\
+    and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not 
+    contain relevant information just return an empty response.
+    \n\n{text}"""
+
+    history = [{"role": "user", "content": prompt}]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=1000
+        ).choices.pop().message.content        
+        return (url, remove_tags(response))
+    except Exception as e:
+        print(
+            f"An error occurred at summarization for {url}: {e}"
+        )
+        return (url, "")  # Return empty string on error
+
+
+def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
+    # Generate text summaries in parallel using multiprocessing
+    args_list = [(url, text, query, model, api_base, token) for url, text in texts_and_urls]
+    
+    # Get number of CPUs to use
+    num_processes = mp.cpu_count()
+
+    # Create a process pool and process URLs in parallel
+    with Pool(processes=num_processes) as pool:
+        summaries = pool.map(process_url, args_list)
     
     return summaries
 
 
-def summarize_with_ollama(texts_and_urls, query, model, ollama_url="http://localhost:11434/api/generate"):
+def summarize(texts_and_urls, query, model, api_base, token):
     # Prepare the context and prompt
-    context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
-    prompt = f"Summarize the following search results with regards to the original query: '{query}' \
+    context = "\n".join(
+        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+        following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}' \
         and include the full URLs as references where appropriate. Use markdown to format your response. Add unicode characters where it makes sense to make the summary colorful. \
-        \n\n{context}"
-    
-    # Create the payload for the POST request
-    payload = {        
-        "model": model,
-        "prompt": prompt,
-        "stream": False,
-        "max_tokens": 1500,
-        "options": {
-	    "num_ctx": 16384
-	}
-    }
-    
-    # Send the POST request to the Ollama server
+        \n\n{context}"""
+
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    history = [{"role": "user", "content": prompt}]
+
     try:
-        print("Processing")
-        response = requests.post(ollama_url, json=payload)
-        if response.status_code == 200:
-            result = json.loads(response.text)["response"]
-            return result
-        else:
-            print(f"Failed to get summary from Ollama server. Status code: {response.status_code}")
-            return None
-    except requests.RequestException as e:
-        print(f"An error occurred while sending request to Ollama server: {e}")
-        return None
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=2000
+        ).choices.pop().message.content
+        return remove_tags(response)
+    except Exception as e:
+        print(
+            f"An error occurred at summarization: {e}"
+        )
 
 
-def optimize_search_query(query, query_model, ollama_url="http://localhost:11434/api/generate"):
+def optimize_search_query(query, query_model, api_base):
     # Prepare the prompt for optimizing the search query
     prompt = f"Optimize the following natural language query to improve its effectiveness in a web search.\
-        Make it very concise. Return just the optimized query no explanations. Query: '{query}'"
-    
+        Make it very concise. Return only exactly the optimized query text no additional texts, quotations or thoughts. Query: '{query}'"
+
     # Create the payload for the POST request
-    payload = {        
+    payload = {
         "model": query_model,
         "model": query_model,
         "prompt": prompt,
         "prompt": prompt,
         "stream": False,
         "stream": False,
         "max_tokens": 50
         "max_tokens": 50
     }
     }
-    
-    # Send the POST request to the Ollama server
+
+    # Send the POST request to the server
     try:
         print("Optimizing search query")
-        response = requests.post(ollama_url, json=payload)
+        response = requests.post(api_base, json=payload)
         if response.status_code == 200:
-            result = json.loads(response.text)["response"].strip()
-            return result.strip('"')
+            result = json.loads(response.text)
+            return (result["choices"][0]["text"].strip())
         else:
-            print(f"Failed to optimize search query from Ollama server. Status code: {response.status_code}")
+            print(
+                f"Failed to optimize search query from the server. Status code: {response.status_code}")
             return query
-    except requests.RequestException as e:
-        print(f"An error occurred while sending request to Ollama server for optimizing the search query: {e}")
+    except Exception as e:
+        print(
+            f"An error occurred while sending request to the server for optimizing the search query: {e}")
         return query
 
 
@@ -151,41 +183,54 @@ def pretty_print_markdown(markdown_text):
     console = Console()
     md = Markdown(markdown_text)
     console.print(md)
-    
+
 
 if __name__ == "__main__":
+    load_dotenv()
+    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
+    api_base = os.getenv("API_BASE")    
+    summary_model = os.getenv("SUMMARY_MODEL")
+
     # Set up argument parser
-    parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with Ollama.")
-    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
-    parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")
-    
+    parser = argparse.ArgumentParser(
+        description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
+    parser.add_argument("query", type=str,
+                        help="The search query to use on DuckDuckGo")
+    parser.add_argument("--num_results", type=int, default=5,
+                        help="Number of search results to process (default: 5)")
+
     # Parse arguments
     args = parser.parse_args()
+
     
     original_query = args.query
-    model = "command-r"
-    #model = "qwq"
-    #model = "qwen2.5:32b"
-    query_model = model
-
-    # Optimize the search query
-    optimized_query = optimize_search_query(original_query, query_model)
-    print(f"Original Query: {original_query}")
-    print(f"Optimized Query: {optimized_query}")
+    
+    print(f"Query: {original_query}")
 
     n = args.num_results  # Number of results to extract
-    links = duckduckgo_search(optimized_query, n)
-    
+    links = duckduckgo_search(original_query, n)
+
     print(f"Top {n} search results:")
     print(f"Top {n} search results:")
     for i, link in enumerate(links, start=1):
     for i, link in enumerate(links, start=1):
         print(f"{i}. {link}")
         print(f"{i}. {link}")
-    
+
     texts_and_urls = extract_text_from_links(links)
-    
+
     print("Summarizing individual search results")
     print("Summarizing individual search results")
-    intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query, model)
-    final_summary = summarize_with_ollama(intermediate_summaries, original_query, model)
-    
+    intermediate_summaries = summarize_individual_texts(
+        texts_and_urls,
+        original_query,
+        summary_model,
+        api_base,
+        token
+    )
+
+    final_summary = summarize(
+        intermediate_summaries,
+        original_query,
+        summary_model,
+        api_base,
+        token)
+
     if final_summary:
-        print("\nFinal Summary of search results:\n")
         pretty_print_markdown(final_summary)
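
For reference, with the .env file in place the updated script takes the search query as a positional argument plus an optional result count, per the argparse setup in this diff; the query text below is only an illustrative example:

    python main.py "how does duckduckgo html search work" --num_results 5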