@@ -0,0 +1,181 @@
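+"""Search DuckDuckGo for a query, extract the text of the top result pages,
+and summarize them with a local Ollama model ("command-r" by default).
+
+Pipeline: optimize the query -> search -> fetch pages -> per-page summaries
+-> final combined summary rendered as markdown in the terminal.
+"""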
+import requests
+from bs4 import BeautifulSoup
+import json
+import argparse
+from urllib.parse import quote_plus, urlparse, parse_qs
+from rich.console import Console
+from rich.markdown import Markdown
+
+def duckduckgo_search(query, num_results=5):
+    # Construct the DuckDuckGo URL for the search query (URL-encode it so
+    # spaces and special characters survive the request)
+    url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
+
+    # Send a GET request to the DuckDuckGo search page
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    response = requests.get(url, headers=headers, timeout=10)
+
+    # Check if the request was successful
+    if response.status_code != 200:
+        print(f"Failed to retrieve search results. Status code: {response.status_code}")
+        return []
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find all result links (assuming they are in <a> tags with class "result__a")
+    result_links = []
+    for a_tag in soup.find_all('a', class_='result__a'):
+        link = a_tag.get('href')
+        if link:
+            result_links.append(link)
+            if len(result_links) >= num_results:
+                break
+
+    return result_links
+
+
+def extract_text_from_links(links):
+    extracted_texts = []
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    for link in links:
+        try:
+            response = requests.get(link, headers=headers, timeout=10)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                # Extract the visible text from the page
+                text = soup.get_text(separator='\n', strip=True)
+                extracted_texts.append((link, text))
+            else:
+                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
+        except requests.RequestException as e:
+            print(f"An error occurred while fetching {link}: {e}")
+
+    return extracted_texts
+
+
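+# The extracted pages are passed to the model verbatim, so a long page can
+# overflow the model's context window. The helper below is a minimal sketch
+# (an assumption, not part of the original script): it caps each page at a
+# fixed character budget before summarization, e.g.
+# texts_and_urls = truncate_for_model(texts_and_urls).
+def truncate_for_model(texts_and_urls, max_chars=8000):
+    # Keep only the first max_chars characters of each page's text
+    return [(url, text[:max_chars]) for url, text in texts_and_urls]
+
+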
+def summarize_individual_texts(texts_and_urls, query, ollama_url="http://localhost:11434/api/generate"):
+    summaries = []
+    for url, text in texts_and_urls:
+        prompt = (
+            f"Extract the relevant information from the following text with regards to the original "
+            f"query: '{query}'\n\n{text}\n"
+        )
+        payload = {
+            "model": "command-r",
+            "prompt": prompt,
+            "stream": False,
+            # Ollama expects generation limits under "options"; "num_predict"
+            # caps the number of generated tokens.
+            "options": {"num_predict": 1000}
+        }
+
+        try:
+            response = requests.post(ollama_url, json=payload)
+            if response.status_code == 200:
+                result = response.json()["response"]
+                summaries.append((url, result))
+            else:
+                print(f"Failed to get summary from Ollama server for {url}. Status code: {response.status_code}")
+        except requests.RequestException as e:
+            print(f"An error occurred while sending request to Ollama server for {url}: {e}")
+
+    return summaries
+
+
+def summarize_with_ollama(texts_and_urls, query, ollama_url="http://localhost:11434/api/generate"):
+    # Prepare the context and prompt
+    context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
+    prompt = (
+        f"Summarize the following search results with regards to the original query: '{query}' "
+        f"and include the full URLs as references where appropriate. Use markdown to format your "
+        f"response and unicode characters.\n\n{context}"
+    )
+
+    # Create the payload for the POST request
+    payload = {
+        "model": "command-r",
+        "prompt": prompt,
+        "stream": False,
+        "options": {"num_predict": 1500}
+    }
+
+    # Send the POST request to the Ollama server
+    try:
+        print("Generating final summary")
+        response = requests.post(ollama_url, json=payload)
+        if response.status_code == 200:
+            return response.json()["response"]
+        else:
+            print(f"Failed to get summary from Ollama server. Status code: {response.status_code}")
+            return None
+    except requests.RequestException as e:
+        print(f"An error occurred while sending request to Ollama server: {e}")
+        return None
+
+
+def optimize_search_query(query, ollama_url="http://localhost:11434/api/generate"):
+    # Prepare the prompt for optimizing the search query
+    prompt = (
+        f"Optimize the following natural language query to improve its effectiveness in a web search. "
+        f"Make it very concise. query: '{query}'"
+    )
+
+    # Create the payload for the POST request
+    payload = {
+        "model": "command-r",
+        "prompt": prompt,
+        "stream": False,
+        "options": {"num_predict": 50}
+    }
+
+    # Send the POST request to the Ollama server; fall back to the original
+    # query if anything goes wrong.
+    try:
+        print("Optimizing search query")
+        response = requests.post(ollama_url, json=payload)
+        if response.status_code == 200:
+            result = response.json()["response"].strip()
+            return result.strip('"')
+        else:
+            print(f"Failed to optimize search query from Ollama server. Status code: {response.status_code}")
+            return query
+    except requests.RequestException as e:
+        print(f"An error occurred while sending request to Ollama server for optimizing the search query: {e}")
+        return query
+
+
+def pretty_print_markdown(markdown_text):
+    # Render the markdown summary in the terminal using rich
+    console = Console()
+    md = Markdown(markdown_text)
+    console.print(md)
+
+
+if __name__ == "__main__":
+    # Set up argument parser
+    parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with Ollama.")
+    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
+    parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    original_query = args.query
+    # Optimize the search query
+    optimized_query = optimize_search_query(original_query)
+    print(f"Original Query: {original_query}")
+    print(f"Optimized Query: {optimized_query}")
+
+    n = args.num_results  # Number of results to extract
+    links = duckduckgo_search(optimized_query, n)
+    if not links:
+        print("No search results found; nothing to summarize.")
+        raise SystemExit(1)
+
+    print(f"Top {n} search results:")
+    for i, link in enumerate(links, start=1):
+        print(f"{i}. {link}")
+
+    texts_and_urls = extract_text_from_links(links)
+
+    print("Summarizing individual search results")
+    intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query)
+
+    final_summary = summarize_with_ollama(intermediate_summaries, original_query)
+
+    if final_summary:
+        print("\nFinal Summary of search results:\n")
+        pretty_print_markdown(final_summary)
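+
+# Example invocation (assumes a local Ollama server at localhost:11434 with
+# the "command-r" model pulled; the script filename is illustrative):
+#
+#     python search_summarize.py "best lightweight linux distros" --num_results 3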