2 commits e7fccaf8fb ... 3ad7659a36

Author SHA1 Message Date
david 3ad7659a36 switch to inquirer 1 week ago
david c4d1245a60 rename main.py to sllm 1 week ago
1 changed file with 237 additions and 0 deletions
sllm

@@ -0,0 +1,237 @@
+#!venv/bin/python
+import os
+import re
+from openai import OpenAI
+from dotenv import load_dotenv
+import requests
+from bs4 import BeautifulSoup
+import json
+import inquirer
+from rich.console import Console
+from rich.markdown import Markdown
+from multiprocessing import Pool
+import multiprocessing as mp
+
+
+def duckduckgo_search(query, num_results=10):
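+    """Scrape DuckDuckGo's HTML endpoint and return up to num_results result URLs."""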
+    # DuckDuckGo's HTML (non-JavaScript) search endpoint; the query is passed
+    # as a request parameter so it gets URL-encoded properly
+    url = "https://html.duckduckgo.com/html/"
+
+    # Send a GET request to the DuckDuckGo search page
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    response = requests.get(url, headers=headers, params={"q": query})
+
+    # Check if the request was successful
+    if response.status_code != 200:
+        print(
+            f"Failed to retrieve search results. Status code: {response.status_code}")
+        return []
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find all result links (assuming they are in <a> tags with class "result__a")
+    result_links = []
+    for a_tag in soup.find_all('a', class_='result__a'):
+        link = a_tag.get('href')
+        if link:
+            result_links.append(link)
+            if len(result_links) >= num_results:
+                break
+
+    return result_links
+
+
+def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
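+    """Fetch each link and return (url, extracted_plain_text) pairs for pages that load successfully."""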
+    extracted_texts = []
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    for link in links:
+        try:
+            response = requests.get(link, headers=headers, timeout=timeout)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                # Extract text from the page
+                text = soup.get_text(separator='\n', strip=True)
+                extracted_texts.append((link, text))
+            else:
+                print(
+                    f"Failed to retrieve content from {link}. Status code: {response.status_code}")
+        except requests.RequestException as e:
+            print(f"An error occurred while fetching {link}: {e}")
+
+    return extracted_texts
+
+def remove_tags(text):
+    """Strip <think>...</think> reasoning blocks that some models prepend to their answer."""
+    # Match <think> blocks and their contents, plus any trailing whitespace
+    pattern = r'<think>[\s\S]*?</think>\s*'
+    # Replace all matches with an empty string
+    return re.sub(pattern, '', text)
+
+def process_url(args):
+    "Helper function to summarize one individual source"
+    url, text, query, model, api_base, token = args
+    
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+    following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}'\
+    and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not 
+    contain relevant information just return an empty response.
+    \n\n{text}"""
+
+    history = [{"role": "user", "content": prompt}]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=1000
+        ).choices[0].message.content
+        return (url, remove_tags(response))
+    except Exception as e:
+        print(
+            f"An error occurred at summarization for {url}: {e}"
+        )
+        return (url, "")  # Return empty string on error
+
+
+def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
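+    """Summarize each fetched page in parallel, one worker process per CPU core."""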
+    # Generate text summaries in parallel using multiprocessing
+    args_list = [(url, text, query, model, api_base, token) for url, text in texts_and_urls]
+    
+    # Get number of CPUs to use
+    num_processes = mp.cpu_count()
+
+    # Create a process pool and process URLs in parallel
+    with Pool(processes=num_processes) as pool:
+        summaries = pool.map(process_url, args_list)
+    
+    return summaries
+
+
+def summarize(texts_and_urls, query, explanation, model, api_base, token):
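+    """Produce a single markdown summary of all (url, text) pairs with respect to the query."""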
+    # Prepare the context and prompt
+    context = "\n".join(
+        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
+    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
+        following document are relevant to the user's query. Summarize the relevant parts of the given text 
+        with regards to the original query: '{query}' and the users additional optional explanation 
+        of the intended search (if available): '{explanation}'.    
+        Include the full URLs as references where appropriate. Use markdown to format your response.
+        Add unicode characters where it makes sense to make the summary colorful.
+        \n\n{context}"""
+
+    client = OpenAI(
+        base_url=api_base,
+        api_key=token
+    )
+
+    history = [{"role": "user", "content": prompt}]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=history,
+            temperature=0,
+            max_tokens=2000
+        ).choices[0].message.content
+        return remove_tags(response)
+    except Exception as e:
+        print(f"An error occurred during final summarization: {e}")
+        return None
+
+
+def optimize_search_query(query, query_model, api_base):
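+    """Rewrite the natural language query into a concise web search query via a completion endpoint."""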
+    # Prepare the prompt for optimizing the search query
+    prompt = f"Optimize the following natural language query to improve its effectiveness in a web search.\
+        Make it very concise. Return only exactly the optimized query text no additional texts, quotations or thoughts. Query: '{query}'"
+
+    # Create the payload for the POST request
+    payload = {
+        "model": query_model,
+        "prompt": prompt,
+        "stream": False,
+        "max_tokens": 50
+    }
+
+    # Send the POST request to the server
+    try:
+        print("Optimizing search query")
+        response = requests.post(api_base, json=payload)
+        if response.status_code == 200:
+            result = response.json()
+            return result["choices"][0]["text"].strip()
+        else:
+            print(
+                f"Failed to optimize search query from the server. Status code: {response.status_code}")
+            return query
+    except Exception as e:
+        print(
+            f"An error occurred while sending request to the server for optimizing the search query: {e}")
+        return query
+
+
+def pretty_print_markdown(markdown_text):
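+    """Render a markdown string to the terminal using rich."""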
+    console = Console()
+    md = Markdown(markdown_text)
+    console.print(md)
+
+
+if __name__ == "__main__":
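+    # Load credentials and model configuration from a local .env file.
+    # The .env is assumed to provide something like (placeholder values):
+    #   OVH_AI_ENDPOINTS_ACCESS_TOKEN=<access token>
+    #   API_BASE=<OpenAI-compatible endpoint URL>
+    #   SUMMARY_MODEL=<model name>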
+    load_dotenv()
+    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
+    api_base = os.getenv("API_BASE")
+    summary_model = os.getenv("SUMMARY_MODEL")
+
+    questions = [
+        inquirer.Text('query', message="What's your query?"),
+        inquirer.Text('explanation', message="Additional context for the query?")
+    ]
+    answers = inquirer.prompt(questions)
+    if not answers:
+        # inquirer returns None if the prompt is aborted (e.g. Ctrl-C)
+        raise SystemExit("No query provided")
+    query = answers["query"]
+    explanation = answers["explanation"]
+
+    print(f"Query: {query}")
+    print(f"Explanation: {explanation}")
+
+    n = 10  # Number of search results to retrieve
+    links = duckduckgo_search(query, num_results=n)
+
+    print(f"Top {n} search results:")
+    for i, link in enumerate(links, start=1):
+        print(f"{i}. {link}")
+
+    texts_and_urls = extract_text_from_links(links)
+
+    print("Summarizing individual search results")
+    intermediate_summaries = summarize_individual_texts(
+        texts_and_urls,
+        query,
+        summary_model,
+        api_base,
+        token
+    )
+
+    final_summary = summarize(
+        intermediate_summaries,
+        query,
+        explanation,
+        summary_model,
+        api_base,
+        token)
+
+    if final_summary:
+        pretty_print_markdown(final_summary)