"""Search DuckDuckGo, scrape the result pages, and summarize them with a local LLM.

Pipeline:
  1. ``optimize_search_query`` -- rewrite the user's query with a small model
  2. ``duckduckgo_search``     -- scrape the DuckDuckGo HTML results page
  3. ``extract_text_from_links`` -- fetch each hit and strip it to plain text
  4. ``summarize_individual_texts`` / ``summarize`` -- two-pass LLM summary

The LLM endpoint is assumed to be an OpenAI-style ``/completions`` API
served locally (e.g. llama.cpp / LM Studio) -- TODO confirm server flavor.
"""

import argparse
import json
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.markdown import Markdown

# Desktop-browser UA so DuckDuckGo and the target sites serve normal HTML.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
)
DEFAULT_API_URL = "http://localhost:8000/api/v1/completions"


def duckduckgo_search(query, num_results=5):
    """Return up to *num_results* result URLs for *query* from DuckDuckGo.

    Uses the JS-free ``html.duckduckgo.com`` endpoint and scrapes the
    anchors with class ``result__a``. Returns [] on any HTTP failure.
    """
    # quote_plus: a raw query with spaces or '&'/'#' previously produced
    # a malformed URL; encode it before interpolation.
    url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
    headers = {"User-Agent": USER_AGENT}
    response = requests.get(url, headers=headers, timeout=10)

    if response.status_code != 200:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Result links are in <a> tags with class "result__a" on this endpoint.
    result_links = []
    for a_tag in soup.find_all("a", class_="result__a"):
        link = a_tag.get("href")
        if link:
            result_links.append(link)
        if len(result_links) >= num_results:
            break
    return result_links


def extract_text_from_links(links, timeout=5):
    """Fetch each URL in *links* and return a list of (url, plain_text) pairs.

    Pages that fail to download (non-200 or request error) are skipped
    with a console message rather than aborting the whole run.
    """
    extracted_texts = []
    headers = {"User-Agent": USER_AGENT}
    for link in links:
        try:
            response = requests.get(link, headers=headers, timeout=timeout)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                # Collapse the whole page to newline-separated visible text.
                text = soup.get_text(separator="\n", strip=True)
                extracted_texts.append((link, text))
            else:
                print(
                    f"Failed to retrieve content from {link}. "
                    f"Status code: {response.status_code}"
                )
        except requests.RequestException as e:
            print(f"An error occurred while fetching {link}: {e}")
    return extracted_texts


def summarize_individual_texts(texts_and_urls, query, model, api_url=DEFAULT_API_URL):
    """First summarization pass: condense each page separately.

    For every (url, text) pair, ask the LLM to extract only the material
    relevant to *query*. Returns a list of (url, summary) pairs; pages
    whose request fails are skipped with a console message.
    """
    summaries = []
    for url, text in texts_and_urls:
        prompt = (
            "Extract the relevant information from the following text with "
            f"regards to the original query: '{query}'\n\n{text}\n"
        )
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "max_tokens": 1000,
            # Large context window so full page dumps fit in one request.
            "options": {"num_ctx": 16384},
        }
        try:
            response = requests.post(api_url, json=payload, timeout=300)
            if response.status_code == 200:
                # OpenAI-completions-style response shape.
                result = response.json()["choices"][0]["text"].strip()
                summaries.append((url, result))
            else:
                print(
                    f"Failed to get summary from server for {url}. "
                    f"Status code: {response.status_code}"
                )
        except requests.RequestException as e:
            print(f"An error occurred while sending request to server for {url}: {e}")
    return summaries


def summarize(texts_and_urls, query, model, api_url=DEFAULT_API_URL):
    """Second summarization pass: merge the per-page summaries into one answer.

    Returns the final markdown summary string, or None on failure.
    """
    # Concatenate every per-page summary with its source URL so the model
    # can cite references in the final markdown.
    context = "\n".join(f"URL: {url}\nText: {text}" for url, text in texts_and_urls)
    prompt = (
        "Summarize the following search results with regards to the original "
        f"query: '{query}' and include the full URLs as references where "
        "appropriate. Use markdown to format your response. Add unicode "
        "characters where it makes sense to make the summary colorful."
        f"\n\n{context}"
    )
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_ctx": 16384},
    }
    try:
        print("Processing")
        response = requests.post(api_url, json=payload, timeout=300)
        if response.status_code == 200:
            return response.json()["choices"][0]["text"].strip()
        print(f"Failed to get summary from the server. Status code: {response.status_code}")
        return None
    except requests.RequestException as e:
        print(f"An error occurred while sending request to the server: {e}")
        return None


def optimize_search_query(query, query_model, api_url=DEFAULT_API_URL):
    """Ask the LLM to rewrite *query* into a concise web-search query.

    Falls back to returning the original *query* on any failure, so the
    pipeline always has something to search with.
    """
    prompt = (
        "Optimize the following natural language query to improve its "
        "effectiveness in a web search. Make it very concise. Return only "
        "the optimized query text no additional texts, quotations or "
        f"thoughts. Query: '{query}'"
    )
    payload = {
        "model": query_model,
        "prompt": prompt,
        "stream": False,
        "max_tokens": 50,
    }
    try:
        print("Optimizing search query")
        response = requests.post(api_url, json=payload, timeout=60)
        if response.status_code == 200:
            return response.json()["choices"][0]["text"].strip()
        print(
            f"Failed to optimize search query from the server. "
            f"Status code: {response.status_code}"
        )
        return query
    except requests.RequestException as e:
        print(
            "An error occurred while sending request to the server for "
            f"optimizing the search query: {e}"
        )
        return query


def pretty_print_markdown(markdown_text):
    """Render *markdown_text* to the terminal with rich."""
    console = Console()
    console.print(Markdown(markdown_text))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Search DuckDuckGo, extract text from results, and summarize with LLM."
    )
    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
    parser.add_argument(
        "--num_results",
        type=int,
        default=5,
        help="Number of search results to process (default: 5)",
    )
    args = parser.parse_args()

    original_query = args.query

    # Model choices for each stage; swap here to experiment.
    query_model = "Gemma-3-4b-it-GGUF"
    summary_model = "Qwen3-1.7B-GGUF"
    final_summary_model = "Qwen3-1.7B-GGUF"

    optimized_query = optimize_search_query(original_query, query_model)
    print(f"Original Query: {original_query}")
    print(f"Optimized Query: {optimized_query}")

    n = args.num_results
    links = duckduckgo_search(optimized_query, n)
    print(f"Top {n} search results:")
    for i, link in enumerate(links, start=1):
        print(f"{i}. {link}")

    texts_and_urls = extract_text_from_links(links)

    print("Summarizing individual search results")
    intermediate_summaries = summarize_individual_texts(
        texts_and_urls, original_query, summary_model
    )
    final_summary = summarize(intermediate_summaries, original_query, final_summary_model)
    if final_summary:
        print("\nFinal Summary of search results:\n")
        pretty_print_markdown(final_summary)