main.py

import requests
from bs4 import BeautifulSoup
import argparse
from rich.console import Console
from rich.markdown import Markdown


def duckduckgo_search(query, num_results=5):
    # Query DuckDuckGo's HTML endpoint; passing the query via params lets
    # requests handle URL-encoding of spaces and special characters
    url = "https://html.duckduckgo.com/html/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, params={'q': query}, headers=headers)
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return []
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect result links (they appear in <a> tags with class "result__a")
    result_links = []
    for a_tag in soup.find_all('a', class_='result__a'):
        link = a_tag.get('href')
        if link:
            result_links.append(link)
        if len(result_links) >= num_results:
            break
    return result_links
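

# Note: DuckDuckGo's HTML endpoint sometimes returns redirect links of the
# form //duckduckgo.com/l/?uddg=<encoded-target>. A minimal sketch to unwrap
# them, assuming that query-parameter layout (unwrap_ddg_redirect is a
# hypothetical helper, not wired into the flow above):
from urllib.parse import urlparse, parse_qs

def unwrap_ddg_redirect(link):
    # Return the decoded target URL for DuckDuckGo redirects, else the link unchanged.
    parsed = urlparse(link)
    if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
        target = parse_qs(parsed.query).get("uddg")
        if target:
            return target[0]
    return link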


def extract_text_from_links(links, timeout=5):
    extracted_texts = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for link in links:
        try:
            response = requests.get(link, headers=headers, timeout=timeout)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # Extract the visible text from the page
                text = soup.get_text(separator='\n', strip=True)
                extracted_texts.append((link, text))
            else:
                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
        except requests.RequestException as e:
            print(f"An error occurred while fetching {link}: {e}")
    return extracted_texts
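

# Extracted pages can easily exceed the 16K-token context window configured in
# the summarization payloads below. A minimal truncation sketch; the character
# cap assumes roughly 4 characters per token and is not a measured value:
def truncate_text(text, max_chars=40000):
    # Keep only the leading portion of very long pages.
    return text if len(text) <= max_chars else text[:max_chars]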


def summarize_individual_texts(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
    summaries = []
    for url, text in texts_and_urls:
        prompt = (
            "Extract the relevant information from the following text with regards to the original "
            f"query: '{query}'\n\n{text}\n"
        )
        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "max_tokens": 1000,
            "options": {
                "num_ctx": 16384
            }
        }
        try:
            response = requests.post(api_url, json=payload)
            if response.status_code == 200:
                result = response.json()["choices"][0]["text"].strip()
                summaries.append((url, result))
            else:
                print(f"Failed to get summary from server for {url}. Status code: {response.status_code}")
        except requests.RequestException as e:
            print(f"An error occurred while sending request to server for {url}: {e}")
    return summaries
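
# The parsing above assumes an OpenAI-style completions response, roughly
# {"choices": [{"text": "..."}], ...}; other server APIs may shape this differently.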


def summarize(texts_and_urls, query, model, api_url="http://localhost:8000/api/v1/completions"):
    # Prepare the context and prompt
    context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
    prompt = (
        f"Summarize the following search results with regards to the original query: '{query}' "
        "and include the full URLs as references where appropriate. Use markdown to format your "
        "response. Add unicode characters where it makes sense to make the summary colorful."
        f"\n\n{context}"
    )
    # Create the payload for the POST request
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "num_ctx": 16384
        }
    }
    # Send the POST request to the server
    try:
        print("Processing")
        response = requests.post(api_url, json=payload)
        if response.status_code == 200:
            return response.json()["choices"][0]["text"].strip()
        else:
            print(f"Failed to get summary from the server. Status code: {response.status_code}")
            return None
    except requests.RequestException as e:
        print(f"An error occurred while sending request to the server: {e}")
        return None
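

# The LLM requests in this script are sent without a timeout, so a hung server
# would block indefinitely. A minimal retry wrapper, sketched under the
# assumption that a generous read timeout suits slow generations
# (post_with_retry is a hypothetical helper, not wired into the calls above):
import time

def post_with_retry(api_url, payload, retries=2, timeout=120):
    # Retry transient network failures with a short linear backoff.
    for attempt in range(retries + 1):
        try:
            return requests.post(api_url, json=payload, timeout=timeout)
        except requests.RequestException:
            if attempt == retries:
                raise
            time.sleep(2 * (attempt + 1))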


def optimize_search_query(query, query_model, api_url="http://localhost:8000/api/v1/completions"):
    # Prepare the prompt for optimizing the search query
    prompt = (
        "Optimize the following natural language query to improve its effectiveness in a web search. "
        "Make it very concise. Return only the optimized query text, no additional text, quotations "
        f"or thoughts. Query: '{query}'"
    )
    # Create the payload for the POST request
    payload = {
        "model": query_model,
        "prompt": prompt,
        "stream": False,
        "max_tokens": 50
    }
    # Send the POST request to the server
    try:
        print("Optimizing search query")
        response = requests.post(api_url, json=payload)
        if response.status_code == 200:
            return response.json()["choices"][0]["text"].strip()
        else:
            print(f"Failed to optimize search query from the server. Status code: {response.status_code}")
            return query
    except requests.RequestException as e:
        print(f"An error occurred while sending request to the server for optimizing the search query: {e}")
        return query


def pretty_print_markdown(markdown_text):
    console = Console()
    md = Markdown(markdown_text)
    console.print(md)


if __name__ == "__main__":
    # Set up argument parser
    parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with an LLM.")
    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
    parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")
    # Parse arguments
    args = parser.parse_args()
    original_query = args.query
    query_model = "Gemma-3-4b-it-GGUF"
    # query_model = "Qwen3-1.7B-GGUF"
    # summary_model = "Qwen3-4B-Instruct-2507-GGUF"
    summary_model = "Qwen3-1.7B-GGUF"
    # final_summary_model = "Gemma-3-4b-it-GGUF"
    final_summary_model = "Qwen3-1.7B-GGUF"
    # Optimize the search query
    optimized_query = optimize_search_query(original_query, query_model)
    print(f"Original Query: {original_query}")
    print(f"Optimized Query: {optimized_query}")
    n = args.num_results  # Number of results to extract
    links = duckduckgo_search(optimized_query, n)
    print(f"Top {n} search results:")
    for i, link in enumerate(links, start=1):
        print(f"{i}. {link}")
    texts_and_urls = extract_text_from_links(links)
    print("Summarizing individual search results")
    intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query, summary_model)
    final_summary = summarize(intermediate_summaries, original_query, final_summary_model)
    if final_summary:
        print("\nFinal Summary of search results:\n")
        pretty_print_markdown(final_summary)
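
# Example invocation (assumes an OpenAI-compatible completions server running
# at http://localhost:8000/api/v1/completions with the models above loaded):
#   python main.py "how do heat pumps work" --num_results 3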