import os
import re
import json
import argparse
import multiprocessing as mp
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI, OpenAIError
from rich.console import Console
from rich.markdown import Markdown

def duckduckgo_search(query, num_results=5):
    # Query the HTML version of DuckDuckGo; `params` handles URL-encoding of the query
    url = "https://html.duckduckgo.com/html/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, params={'q': query}, headers=headers)
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return []
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect result links (DuckDuckGo renders them as <a> tags with class "result__a")
    result_links = []
    for a_tag in soup.find_all('a', class_='result__a'):
        link = a_tag.get('href')
        if link:
            result_links.append(link)
        if len(result_links) >= num_results:
            break
    return result_links

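# Illustrative output shape (hypothetical URLs; DuckDuckGo's HTML markup is
# not a stable API, so the "result__a" class used above may change over time):
#
#   >>> duckduckgo_search("ovhcloud ai endpoints", num_results=2)
#   ['https://example.com/first-hit', 'https://example.com/second-hit']
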
def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
    extracted_texts = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for link in links:
        try:
            response = requests.get(link, headers=headers, timeout=timeout)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                # Extract the visible text from the page
                text = soup.get_text(separator='\n', strip=True)
                extracted_texts.append((link, text))
            else:
                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
        except requests.RequestException as e:
            print(f"An error occurred while fetching {link}: {e}")
    return extracted_texts

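# Each entry pairs a URL with the plain text extracted from that page, e.g.
# (values illustrative):
#
#   [("https://example.com/a", "Full page text..."),
#    ("https://example.com/b", "Another page...")]
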
def remove_tags(text):
    # Regular expression pattern to match '<think>' blocks, including any
    # trailing whitespace, and replace them with an empty string
    pattern = r'<think>[\s\S]*?</think>\s*'
    return re.sub(pattern, '', text)

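# Example of what remove_tags strips: reasoning models such as Qwen3 prepend
# a <think> block to their answers (sample strings are illustrative):
#
#   remove_tags("<think>pondering...</think>\n\nParis is the capital.")
#   # -> "Paris is the capital."
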
def process_url(args):
    """Helper function to summarize one individual source."""
    url, text, query, model, api_base, token = args

    client = OpenAI(
        base_url=api_base,
        api_key=token
    )
    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts, if any, of the
following document are relevant to the user's query. Summarize the relevant parts of the given text with regard to the original query: '{query}' \
and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not
contain relevant information, just return an empty response.
\n\n{text}"""
    history = [{"role": "user", "content": prompt}]
    try:
        response = client.chat.completions.create(
            model=model,
            messages=history,
            temperature=0,
            max_tokens=1000
        ).choices[0].message.content
        return (url, remove_tags(response))
    except OpenAIError as e:
        print(f"An error occurred at summarization for {url}: {e}")
        return (url, "")  # Return an empty summary on error

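# process_url takes a single tuple because Pool.map passes exactly one
# argument per work item (pool.starmap with an unpacked signature would be
# the equivalent alternative). Each work item looks like this (values are
# illustrative):
#
#   ("https://example.com/article", "extracted page text...",
#    "the user's query", "Qwen3-32B", api_base, token)
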
def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
    # Generate the per-source summaries in parallel using multiprocessing
    args_list = [(url, text, query, model, api_base, token)
                 for url, text in texts_and_urls]

    # Use one worker process per CPU core
    num_processes = mp.cpu_count()
    # Create a process pool and summarize the sources in parallel
    with Pool(processes=num_processes) as pool:
        summaries = pool.map(process_url, args_list)

    return summaries

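# The pool above forks one worker process per CPU core, but each worker mostly
# waits on network I/O (the LLM endpoint does the heavy lifting), so a thread
# pool is a lighter-weight alternative. A minimal sketch, not used by the
# script below; the function name is hypothetical:
def summarize_individual_texts_threaded(texts_and_urls, query, model, api_base, token):
    from multiprocessing.pool import ThreadPool
    args_list = [(url, text, query, model, api_base, token)
                 for url, text in texts_and_urls]
    # Cap the thread count: one thread per source, up to a small limit
    with ThreadPool(processes=min(8, max(1, len(args_list)))) as pool:
        return pool.map(process_url, args_list)
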
def summarize(texts_and_urls, query, model, api_base, token):
    # Prepare the combined context and the prompt
    context = "\n".join(
        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts, if any, of the
following document are relevant to the user's query. Summarize the relevant parts of the given text with regard to the original query: '{query}' \
and include the full URLs as references where appropriate. Use markdown to format your response. Add unicode characters where it makes sense to make the summary colorful. \
\n\n{context}"""
    client = OpenAI(
        base_url=api_base,
        api_key=token
    )
    history = [{"role": "user", "content": prompt}]
    try:
        response = client.chat.completions.create(
            model=model,
            messages=history,
            temperature=0,
            max_tokens=2000
        ).choices[0].message.content
        return remove_tags(response)
    except OpenAIError as e:
        print(f"An error occurred during the final summarization: {e}")
        return ""  # Return an empty summary on error

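# summarize is the "reduce" step of the pipeline: it takes the per-source
# (url, summary) pairs produced by process_url and condenses them into a
# single markdown answer, which keeps the final prompt much smaller than the
# raw page texts.
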
def optimize_search_query(query, query_model, api_base, token):
    # Prepare the prompt for optimizing the search query
    prompt = f"Optimize the following natural language query to improve its effectiveness in a web search. \
Make it very concise. Return only the optimized query text, no additional text, quotations or thoughts. Query: '{query}'"
    # Create the payload for a completions-style POST request
    payload = {
        "model": query_model,
        "prompt": prompt,
        "stream": False,
        "max_tokens": 50
    }
    headers = {"Authorization": f"Bearer {token}"}
    # Send the POST request to the completions endpoint
    try:
        print("Optimizing search query")
        response = requests.post(f"{api_base}completions", json=payload, headers=headers)
        if response.status_code == 200:
            result = json.loads(response.text)
            return result["choices"][0]["text"].strip()
        print(f"Failed to optimize search query from the server. Status code: {response.status_code}")
        return query
    except requests.RequestException as e:
        print(f"An error occurred while sending the query optimization request to the server: {e}")
        return query

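# Illustrative call (this helper is defined but never invoked in the main
# block below; the model name here is an assumption):
#
#   optimized = optimize_search_query(
#       "what's the weather like in Paris today", "Qwen3-32B", api_base, token)
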
def pretty_print_markdown(markdown_text):
    console = Console()
    md = Markdown(markdown_text)
    console.print(md)

if __name__ == "__main__":
    load_dotenv()
    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
    # Set up the argument parser
    parser = argparse.ArgumentParser(
        description="Search DuckDuckGo, extract text from results, and summarize with an LLM.")
    parser.add_argument("query", type=str,
                        help="The search query to use on DuckDuckGo")
    parser.add_argument("--num_results", type=int, default=5,
                        help="Number of search results to process (default: 5)")
    # Parse arguments
    args = parser.parse_args()
    api_base = "https://oai.endpoints.kepler.ai.cloud.ovh.net/v1/"
    original_query = args.query
    summary_model = "Qwen3-32B"
    print(f"Query: {original_query}")
    n = args.num_results  # Number of results to extract
    links = duckduckgo_search(original_query, n)
    print(f"Top {n} search results:")
    for i, link in enumerate(links, start=1):
        print(f"{i}. {link}")
    texts_and_urls = extract_text_from_links(links)
    print("Summarizing individual search results")
    intermediate_summaries = summarize_individual_texts(
        texts_and_urls,
        original_query,
        summary_model,
        api_base,
        token
    )
    final_summary = summarize(
        intermediate_summaries,
        original_query,
        summary_model,
        api_base,
        token)
    if final_summary:
        print("\n################################# Final Summary of search results #################################\n")
        pretty_print_markdown(final_summary)
|