import argparse
import json

import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.markdown import Markdown

def duckduckgo_search(query, num_results=5):
    # Query DuckDuckGo's HTML endpoint; requests handles URL-encoding of the query
    url = "https://html.duckduckgo.com/html/"

    # Send a GET request to the DuckDuckGo search page with a browser-like User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    response = requests.get(url, params={'q': query}, headers=headers, timeout=10)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect result links (DuckDuckGo wraps them in <a> tags with class "result__a")
    result_links = []
    for a_tag in soup.find_all('a', class_='result__a'):
        link = a_tag.get('href')
        if link:
            result_links.append(link)
        if len(result_links) >= num_results:
            break

    return result_links

def extract_text_from_links(links, timeout=5):
    extracted_texts = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    for link in links:
        print(f"Downloading text from: {link}")
        try:
            response = requests.get(link, headers=headers, timeout=timeout)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # Extract the visible text from the page
                text = soup.get_text(separator='\n', strip=True)
                extracted_texts.append((link, text))
            else:
                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
        except requests.RequestException as e:
            print(f"An error occurred while fetching {link}: {e}")

    return extracted_texts

def summarize_individual_texts(texts_and_urls, query, ollama_url="http://localhost:11434/api/generate"):
    summaries = []
    for url, text in texts_and_urls:
        prompt = (
            f"Extract the relevant information from the following text with regard to the original "
            f"query: '{query}'\n\n{text}\n"
        )
        payload = {
            "model": "command-r",
            "prompt": prompt,
            "stream": False,
            "options": {"num_predict": 1000}  # cap the length of each per-page summary
        }

        try:
            response = requests.post(ollama_url, json=payload)
            if response.status_code == 200:
                result = response.json()["response"]
                summaries.append((url, result))
            else:
                print(f"Failed to get summary from Ollama server for {url}. Status code: {response.status_code}")
        except requests.RequestException as e:
            print(f"An error occurred while sending request to Ollama server for {url}: {e}")

    return summaries

def summarize_with_ollama(texts_and_urls, query, ollama_url="http://localhost:11434/api/generate"):
    # Prepare the context and prompt
    context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
    prompt = (
        f"Summarize the following search results with regard to the original query: '{query}' "
        f"and include the full URLs as references where appropriate. "
        f"Use markdown and unicode characters to format your response.\n\n{context}"
    )

    # Create the payload for the POST request
    payload = {
        "model": "command-r",
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": 1500}  # cap the length of the final summary
    }

    # Send the POST request to the Ollama server
    try:
        print("Generating the final summary")
        response = requests.post(ollama_url, json=payload)
        if response.status_code == 200:
            return response.json()["response"]
        else:
            print(f"Failed to get summary from Ollama server. Status code: {response.status_code}")
            return None
    except requests.RequestException as e:
        print(f"An error occurred while sending request to Ollama server: {e}")
        return None

def optimize_search_query(query, ollama_url="http://localhost:11434/api/generate"):
    # Prepare the prompt for optimizing the search query
    prompt = (
        f"Optimize the following natural language query to improve its effectiveness in a web search. "
        f"Make it very concise. query: '{query}'"
    )

    # Create the payload for the POST request
    payload = {
        "model": "command-r",
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": 50}  # the rewritten query should stay short
    }

    # Send the POST request to the Ollama server
    try:
        print("Optimizing search query")
        response = requests.post(ollama_url, json=payload)
        if response.status_code == 200:
            result = response.json()["response"].strip()
            return result.strip('"')
        else:
            print(f"Failed to optimize search query from Ollama server. Status code: {response.status_code}")
            return query
    except requests.RequestException as e:
        print(f"An error occurred while sending request to Ollama server for optimizing the search query: {e}")
        return query

def pretty_print_markdown(markdown_text):
    console = Console()
    md = Markdown(markdown_text)
    console.print(md)

if __name__ == "__main__":
    # Set up argument parser
    parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with Ollama.")
    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
    parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")

    # Parse arguments
    args = parser.parse_args()

    original_query = args.query
    # Optimize the search query
    optimized_query = optimize_search_query(original_query)
    print(f"Original Query: {original_query}")
    print(f"Optimized Query: {optimized_query}")

    n = args.num_results  # Number of results to extract
    links = duckduckgo_search(optimized_query, n)

    print(f"Top {n} search results:")
    for i, link in enumerate(links, start=1):
        print(f"{i}. {link}")

    texts_and_urls = extract_text_from_links(links)

    print("Summarizing individual search results")
    intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query)

    final_summary = summarize_with_ollama(intermediate_summaries, original_query)

    if final_summary:
        print("\nFinal Summary of search results:\n")
        pretty_print_markdown(final_summary)
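A quick way to try the script: assuming it is saved as search_summarize.py (the filename is arbitrary), an Ollama server is listening on localhost:11434, and the command-r model has already been pulled, a run might look like the following; the example query is purely illustrative.

    python search_summarize.py "how do heat pumps work" --num_results 3

The script first asks the model to rewrite the query for web search, fetches the top DuckDuckGo results, summarizes each page individually, and finally renders a combined markdown summary in the terminal with rich.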