|
|
@@ -1,29 +1,37 @@
|
|
|
+import os
|
|
|
+import re
|
|
|
+from openai import OpenAI
|
|
|
+from dotenv import load_dotenv
|
|
|
import requests
|
|
|
from bs4 import BeautifulSoup
|
|
|
import json
|
|
|
import argparse
|
|
|
from rich.console import Console
|
|
|
from rich.markdown import Markdown
|
|
|
+from multiprocessing import Pool
|
|
|
+import multiprocessing as mp
|
|
|
+
|
|
|
|
|
|
def duckduckgo_search(query, num_results=5):
|
|
|
# Construct the DuckDuckGo URL for the search query
|
|
|
url = f"https://html.duckduckgo.com/html/?q={query}"
|
|
|
-
|
|
|
+
|
|
|
# Send a GET request to the DuckDuckGo search page
|
|
|
headers = {
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
response = requests.get(url, headers=headers)
|
|
|
-
|
|
|
+
|
|
|
# Check if the request was successful
|
|
|
if response.status_code != 200:
|
|
|
- print(f"Failed to retrieve search results. Status code: {response.status_code}")
|
|
|
+ print(
|
|
|
+ f"Failed to retrieve search results. Status code: {response.status_code}")
|
|
|
return []
|
|
|
-
|
|
|
+
|
|
|
# Parse the HTML content using BeautifulSoup
|
|
|
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
-
|
|
|
+
|
|
|
# Find all result links (assuming they are in <a> tags with class "result__a")
|
|
|
result_links = []
|
|
|
for a_tag in soup.find_all('a', class_='result__a'):
|
|
|
@@ -32,17 +40,17 @@ def duckduckgo_search(query, num_results=5):
|
|
|
result_links.append(link)
|
|
|
if len(result_links) >= num_results:
|
|
|
break
|
|
|
-
|
|
|
+
|
|
|
return result_links
|
|
|
|
|
|
|
|
|
-def extract_text_from_links(links, timeout=5):
|
|
|
+def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
|
|
|
extracted_texts = []
|
|
|
headers = {
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
|
|
|
}
|
|
|
-
|
|
|
- for link in links:
|
|
|
+
|
|
|
+ for link in links:
|
|
|
try:
|
|
|
response = requests.get(link, headers=headers, timeout=timeout)
|
|
|
if response.status_code == 200:
|
|
|
@@ -51,104 +59,123 @@ def extract_text_from_links(links, timeout=5):
|
|
|
text = soup.get_text(separator='\n', strip=True)
|
|
|
extracted_texts.append((link, text))
|
|
|
else:
|
|
|
- print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
|
|
|
+ print(
|
|
|
+ f"Failed to retrieve content from {link}. Status code: {response.status_code}")
|
|
|
except requests.RequestException as e:
|
|
|
print(f"An error occurred while fetching {link}: {e}")
|
|
|
-
|
|
|
+
|
|
|
return extracted_texts
|
|
|
|
|
|
+def remove_tags(text):
|
|
|
+ # Regular expression pattern to match '<think>' tags and their contents
|
|
|
+    pattern = r'<think>[\s\S]*?<\/think>\s*'
|
|
|
+ # Replace all matches with an empty string
|
|
|
+ result = re.sub(pattern, '', text)
|
|
|
+ return result
|
|
|
|
|
|
-def summarize_individual_texts(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
|
|
|
- summaries = []
|
|
|
- for url, text in texts_and_urls:
|
|
|
- prompt = f"Extract the relevant information from the following text with regards to the original \
|
|
|
- query: '{query}'\n\n{text}\n"
|
|
|
- payload = {
|
|
|
- "model": model,
|
|
|
- "prompt": prompt,
|
|
|
- "stream": False,
|
|
|
- "max_tokens": 1000,
|
|
|
- "options": {
|
|
|
- "num_ctx": 16384
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- try:
|
|
|
- response = requests.post(api_url, json=payload)
|
|
|
- if response.status_code == 200:
|
|
|
- #result = json.loads(response.text)["response"]
|
|
|
- result_json = json.loads(response.text)
|
|
|
- print(result_json)
|
|
|
- result = result_json["choices"][0]["text"].strip()
|
|
|
- summaries.append((url, result))
|
|
|
- else:
|
|
|
- print(f"Failed to get summary from server for {url}. Status code: {response.status_code}")
|
|
|
- except requests.RequestException as e:
|
|
|
- print(f"An error occurred while sending request to server for {url}: {e}")
|
|
|
+def process_url(args):
|
|
|
+    """Summarize one (url, text) source via the OpenAI-compatible endpoint; returns (url, summary)."""
|
|
|
+ url, text, query, model, api_base, token = args
|
|
|
+
|
|
|
+ client = OpenAI(
|
|
|
+ base_url=api_base,
|
|
|
+ api_key=token
|
|
|
+ )
|
|
|
+
|
|
|
+ prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
|
|
|
+ following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}'\
|
|
|
+ and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not
|
|
|
+ contain relevant information just return an empty response.
|
|
|
+ \n\n{text}"""
|
|
|
+
|
|
|
+ history = [{"role": "user", "content": prompt}]
|
|
|
+
|
|
|
+ try:
|
|
|
+ response = client.chat.completions.create(
|
|
|
+ model=model,
|
|
|
+ messages=history,
|
|
|
+ temperature=0,
|
|
|
+ max_tokens=1000
|
|
|
+        ).choices[0].message.content
|
|
|
+ return (url, remove_tags(response))
|
|
|
+    except Exception as e:  # OpenAI SDK raises openai.APIError (httpx-based), not requests.RequestException
|
|
|
+ print(
|
|
|
+ f"An error occurred at summarization for {url}: {e}"
|
|
|
+ )
|
|
|
+ return (url, "") # Return empty string on error
|
|
|
+
|
|
|
+
|
|
|
+def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
|
|
|
+ # Generate text summaries in parallel using multiprocessing
|
|
|
+ args_list = [(url, text, query, model, api_base, token) for url, text in texts_and_urls]
|
|
|
+
|
|
|
+ # Get number of CPUs to use
|
|
|
+ num_processes = mp.cpu_count()
|
|
|
+
|
|
|
+ # Create a process pool and process URLs in parallel
|
|
|
+ with Pool(processes=num_processes) as pool:
|
|
|
+ summaries = pool.map(process_url, args_list)
|
|
|
|
|
|
return summaries
|
|
|
|
|
|
|
|
|
-def summarize(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
|
|
|
+def summarize(texts_and_urls, query, model, api_base, token):
|
|
|
# Prepare the context and prompt
|
|
|
- context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
|
|
|
- prompt = f"Summarize the following search results with regards to the original query: '{query}' \
|
|
|
+ context = "\n".join(
|
|
|
+ [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
|
|
|
+ prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
|
|
|
+ following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}' \
|
|
|
and include the full URLs as references where appropriate. Use markdown to format your response. Add unicode characters where it makes sense to make the summary colorful. \
|
|
|
- \n\n{context}"
|
|
|
-
|
|
|
- # Create the payload for the POST request
|
|
|
- payload = {
|
|
|
- "model": model,
|
|
|
- "prompt": prompt,
|
|
|
- "stream": False,
|
|
|
- # "max_tokens": 1500,
|
|
|
- "options": {
|
|
|
- "num_ctx": 16384
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- # Send the POST request to the server
|
|
|
+ \n\n{context}"""
|
|
|
+
|
|
|
+ client = OpenAI(
|
|
|
+ base_url=api_base,
|
|
|
+ api_key=token
|
|
|
+ )
|
|
|
+
|
|
|
+ history = [{"role": "user", "content": prompt}]
|
|
|
+
|
|
|
try:
|
|
|
- print("Processing")
|
|
|
- response = requests.post(api_url, json=payload)
|
|
|
- if response.status_code == 200:
|
|
|
- #result = json.loads(response.text)["response"]
|
|
|
- #return result
|
|
|
- result = json.loads(response.text)
|
|
|
- return(result["choices"][0]["text"].strip())
|
|
|
- else:
|
|
|
- print(f"Failed to get summary from the server. Status code: {response.status_code}")
|
|
|
- return None
|
|
|
+ response = client.chat.completions.create(
|
|
|
+ model=model,
|
|
|
+ messages=history,
|
|
|
+ temperature=0,
|
|
|
+ max_tokens=2000
|
|
|
+        ).choices[0].message.content
|
|
|
+ return remove_tags(response)
|
|
|
except requests.RequestException as e:
|
|
|
- print(f"An error occurred while sending request to the server: {e}")
|
|
|
- return None
|
|
|
+ print(
|
|
|
+            f"An error occurred at final summarization: {e}"
|
|
|
+ )
|
|
|
|
|
|
|
|
|
-def optimize_search_query(query, query_model, api_url="http://localhost:8000/api/v1/completions"):
|
|
|
+def optimize_search_query(query, query_model, api_base):
|
|
|
# Prepare the prompt for optimizing the search query
|
|
|
prompt = f"Optimize the following natural language query to improve its effectiveness in a web search.\
|
|
|
Make it very concise. Return only the optimized query text no additional texts, quotations or thoughts. Query: '{query}'"
|
|
|
-
|
|
|
+
|
|
|
# Create the payload for the POST request
|
|
|
- payload = {
|
|
|
+ payload = {
|
|
|
"model": query_model,
|
|
|
"prompt": prompt,
|
|
|
"stream": False,
|
|
|
"max_tokens": 50
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
# Send the POST request to the server
|
|
|
try:
|
|
|
print("Optimizing search query")
|
|
|
- response = requests.post(api_url, json=payload)
|
|
|
+ response = requests.post(api_base, json=payload)
|
|
|
if response.status_code == 200:
|
|
|
- result = json.loads(response.text)
|
|
|
- return(result["choices"][0]["text"].strip())
|
|
|
+ result = json.loads(response.text)
|
|
|
+ return (result["choices"][0]["text"].strip())
|
|
|
else:
|
|
|
- print(f"Failed to optimize search query from the server. Status code: {response.status_code}")
|
|
|
+ print(
|
|
|
+ f"Failed to optimize search query from the server. Status code: {response.status_code}")
|
|
|
return query
|
|
|
except requests.RequestException as e:
|
|
|
- print(f"An error occurred while sending request to the server for optimizing the search query: {e}")
|
|
|
+ print(
|
|
|
+ f"An error occurred while sending request to the server for optimizing the search query: {e}")
|
|
|
return query
|
|
|
|
|
|
|
|
|
@@ -156,43 +183,54 @@ def pretty_print_markdown(markdown_text):
|
|
|
console = Console()
|
|
|
md = Markdown(markdown_text)
|
|
|
console.print(md)
|
|
|
-
|
|
|
+
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
+ load_dotenv()
|
|
|
+ token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
|
|
|
+
|
|
|
# Set up argument parser
|
|
|
- parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
|
|
|
- parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
|
|
|
- parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")
|
|
|
-
|
|
|
+ parser = argparse.ArgumentParser(
|
|
|
+ description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
|
|
|
+ parser.add_argument("query", type=str,
|
|
|
+ help="The search query to use on DuckDuckGo")
|
|
|
+ parser.add_argument("--num_results", type=int, default=5,
|
|
|
+ help="Number of search results to process (default: 5)")
|
|
|
+
|
|
|
# Parse arguments
|
|
|
args = parser.parse_args()
|
|
|
-
|
|
|
+
|
|
|
+ api_base = "https://oai.endpoints.kepler.ai.cloud.ovh.net/v1/"
|
|
|
original_query = args.query
|
|
|
- query_model = "Gemma-3-4b-it-GGUF"
|
|
|
- #query_model = "Qwen3-1.7B-GGUF"
|
|
|
- #summary_model = "Qwen3-4B-Instruct-2507-GGUF"
|
|
|
- summary_model = "Qwen3-1.7B-GGUF"
|
|
|
- #final_summary_model = "Gemma-3-4b-it-GGUF"
|
|
|
- final_summary_model = "Qwen3-1.7B-GGUF"
|
|
|
-
|
|
|
- # Optimize the search query
|
|
|
- optimized_query = optimize_search_query(original_query, query_model)
|
|
|
- print(f"Original Query: {original_query}")
|
|
|
- print(f"Optimized Query: {optimized_query}")
|
|
|
+ summary_model = "Qwen3-32B"
|
|
|
+
|
|
|
+ print(f"Query: {original_query}")
|
|
|
|
|
|
n = args.num_results # Number of results to extract
|
|
|
- links = duckduckgo_search(optimized_query, n)
|
|
|
-
|
|
|
+ links = duckduckgo_search(original_query, n)
|
|
|
+
|
|
|
print(f"Top {n} search results:")
|
|
|
for i, link in enumerate(links, start=1):
|
|
|
print(f"{i}. {link}")
|
|
|
-
|
|
|
- texts_and_urls = extract_text_from_links(links)
|
|
|
-
|
|
|
+
|
|
|
+ texts_and_urls = extract_text_from_links(links)
|
|
|
+
|
|
|
print("Summarizing individual search results")
|
|
|
- intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query, summary_model)
|
|
|
- final_summary = summarize(intermediate_summaries, original_query, final_summary_model)
|
|
|
-
|
|
|
+ intermediate_summaries = summarize_individual_texts(
|
|
|
+ texts_and_urls,
|
|
|
+ original_query,
|
|
|
+ summary_model,
|
|
|
+ api_base,
|
|
|
+ token
|
|
|
+ )
|
|
|
+
|
|
|
+ final_summary = summarize(
|
|
|
+ intermediate_summaries,
|
|
|
+ original_query,
|
|
|
+ summary_model,
|
|
|
+ api_base,
|
|
|
+ token)
|
|
|
+
|
|
|
if final_summary:
|
|
|
- print("\nFinal Summary of search results:\n")
|
|
|
+ print("\n################################# Final Summary of search results ################################# \n")
|
|
|
pretty_print_markdown(final_summary)
|