import argparse
import json
import multiprocessing as mp
import os
import re
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from rich.console import Console
from rich.markdown import Markdown


def duckduckgo_search(query, num_results=5):
    # Construct the DuckDuckGo URL for the search query
    url = f"https://html.duckduckgo.com/html/?q={query}"

    # Send a GET request to the DuckDuckGo search page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all result links (assuming they are in <a> tags with class "result__a")
    result_links = []
    for a_tag in soup.find_all('a', class_='result__a'):
        link = a_tag.get('href')
        if link:
            result_links.append(link)
        if len(result_links) >= num_results:
            break

    return result_links


def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
    extracted_texts = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for link in links:
        try:
            response = requests.get(link, headers=headers, timeout=timeout)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # Extract the visible text from the page
                text = soup.get_text(separator='\n', strip=True)
                extracted_texts.append((link, text))
            else:
                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
        except requests.RequestException as e:
            print(f"An error occurred while fetching {link}: {e}")
    return extracted_texts


def remove_tags(text):
    # Regular expression pattern to match '<think>' blocks and their contents
    pattern = r'<think>[\s\S]*?</think>\n\n'
    # Replace all matches with an empty string
    result = re.sub(pattern, '', text)
    return result


def process_url(args):
    """Helper function to summarize one individual source."""
    url, text, query, model, api_base, token = args
    client = OpenAI(
        base_url=api_base,
        api_key=token
    )
    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts, if any, of the following document are relevant to the user's query. \
Summarize the relevant parts of the given text with regard to the original query: '{query}' \
and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. \
If the text does not contain relevant information, just return an empty response.

{text}"""
    history = [{"role": "user", "content": prompt}]
    try:
        response = client.chat.completions.create(
            model=model,
            messages=history,
            temperature=0,
            max_tokens=1000
        ).choices[0].message.content
        return (url, remove_tags(response))
    except Exception as e:
        print(f"An error occurred at summarization for {url}: {e}")
        return (url, "")  # Return an empty summary on error


def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
    # Generate text summaries in parallel using multiprocessing
    args_list = [(url, text, query, model, api_base, token)
                 for url, text in texts_and_urls]

    # Use one worker process per available CPU
    num_processes = mp.cpu_count()

    # Create a process pool and summarize the sources in parallel
    with Pool(processes=num_processes) as pool:
        summaries = pool.map(process_url, args_list)

    return summaries


def summarize(texts_and_urls, query, model, api_base, token):
    # Prepare the context and prompt
    context = "\n".join(
        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts, if any, of the following documents are relevant to the user's query. \
Summarize the relevant parts of the given text with regard to the original query: '{query}' \
and include the full URLs as references where appropriate. Use markdown to format your response. \
Add unicode characters where it makes sense to make the summary colorful.

{context}"""
    client = OpenAI(
        base_url=api_base,
        api_key=token
    )
    history = [{"role": "user", "content": prompt}]
    try:
        response = client.chat.completions.create(
            model=model,
            messages=history,
            temperature=0,
            max_tokens=2000
        ).choices[0].message.content
        return remove_tags(response)
    except Exception as e:
        print(f"An error occurred during the final summarization: {e}")
        return ""


def optimize_search_query(query, query_model, api_base):
    # Prepare the prompt for optimizing the search query
    prompt = f"Optimize the following natural language query to improve its effectiveness in a web search. \
Make it very concise. Return only the optimized query text, with no additional text, quotations or thoughts. Query: '{query}'"

    # Create the payload for the POST request
    payload = {
        "model": query_model,
        "prompt": prompt,
        "stream": False,
        "max_tokens": 50
    }

    # Send the POST request to the server
    try:
        print("Optimizing search query")
        response = requests.post(api_base, json=payload)
        if response.status_code == 200:
            result = json.loads(response.text)
            return result["choices"][0]["text"].strip()
        else:
            print(f"Failed to optimize search query from the server. Status code: {response.status_code}")
            return query
    except Exception as e:
        print(f"An error occurred while sending the request to the server for optimizing the search query: {e}")
        return query


def pretty_print_markdown(markdown_text):
    console = Console()
    md = Markdown(markdown_text)
    console.print(md)


if __name__ == "__main__":
    load_dotenv()
    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
    api_base = os.getenv("API_BASE")
    summary_model = os.getenv("SUMMARY_MODEL")

    # Set up the argument parser
    parser = argparse.ArgumentParser(
        description="Search DuckDuckGo, extract text from results, and summarize with an LLM.")
    parser.add_argument("query", type=str,
                        help="The search query to use on DuckDuckGo")
    parser.add_argument("--num_results", type=int, default=5,
                        help="Number of search results to process (default: 5)")

    # Parse arguments
    args = parser.parse_args()
    original_query = args.query
    print(f"Query: {original_query}")
    n = args.num_results  # Number of results to extract

    links = duckduckgo_search(original_query, n)
    print(f"Top {n} search results:")
    for i, link in enumerate(links, start=1):
        print(f"{i}. {link}")

    texts_and_urls = extract_text_from_links(links)

    print("Summarizing individual search results")
    intermediate_summaries = summarize_individual_texts(
        texts_and_urls, original_query, summary_model, api_base, token)

    final_summary = summarize(
        intermediate_summaries, original_query, summary_model, api_base, token)
    if final_summary:
        pretty_print_markdown(final_summary)