
Delete 'main.py'

dmsc 1 week ago
parent
commit
9f9594b185
1 changed file with 0 additions and 236 deletions

+ 0 - 236
main.py

@@ -1,236 +0,0 @@
-import os
-import re
-from openai import OpenAI
-from dotenv import load_dotenv
-import requests
-from bs4 import BeautifulSoup
-import json
-import argparse
-from rich.console import Console
-from rich.markdown import Markdown
-from multiprocessing import Pool
-import multiprocessing as mp
-
-
-def duckduckgo_search(query, num_results=5):
-    # Construct the DuckDuckGo URL for the search query
-    url = f"https://html.duckduckgo.com/html/?q={query}"
-
-    # Send a GET request to the DuckDuckGo search page
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
-
-    response = requests.get(url, headers=headers)
-
-    # Check if the request was successful
-    if response.status_code != 200:
-        print(
-            f"Failed to retrieve search results. Status code: {response.status_code}")
-        return []
-
-    # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(response.content, 'html.parser')
-
-    # Find all result links (assuming they are in <a> tags with class "result__a")
-    result_links = []
-    for a_tag in soup.find_all('a', class_='result__a'):
-        link = a_tag.get('href')
-        if link:
-            result_links.append(link)
-            if len(result_links) >= num_results:
-                break
-
-    return result_links
-
-
-def extract_text_from_links(links, timeout=5) -> list[tuple[str, str]]:
-    extracted_texts = []
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
-
-    for link in links:
-        try:
-            response = requests.get(link, headers=headers, timeout=timeout)
-            if response.status_code == 200:
-                soup = BeautifulSoup(response.content, 'html.parser')
-                # Extract text from the page
-                text = soup.get_text(separator='\n', strip=True)
-                extracted_texts.append((link, text))
-            else:
-                print(
-                    f"Failed to retrieve content from {link}. Status code: {response.status_code}")
-        except requests.RequestException as e:
-            print(f"An error occurred while fetching {link}: {e}")
-
-    return extracted_texts
-
-def remove_tags(text):        
-    # Regular expression pattern to match '<think>' tags and their contents
-    pattern = r'<think>[\s\S]*?<\/think>\n\n'
-    # Replace all matches with an empty string
-    result = re.sub(pattern, '', text)
-    return result
-
-def process_url(args):
-    "Helper function to summarize one individual source"
-    url, text, query, model, api_base, token = args
-    
-    client = OpenAI(
-        base_url=api_base,
-        api_key=token
-    )
-
-    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
-    following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}'\
-    and include the full URLs as references where appropriate. Leave out everything that has little or no relevance to the query. If the text does not 
-    contain relevant information just return an empty response.
-    \n\n{text}"""
-
-    history = [{"role": "user", "content": prompt}]
-
-    try:
-        response = client.chat.completions.create(
-            model=model,
-            messages=history,
-            temperature=0,
-            max_tokens=1000
-        ).choices.pop().message.content        
-        return (url, remove_tags(response))
-    except Exception as e:
-        print(
-            f"An error occurred at summarization for {url}: {e}"
-        )
-        return (url, "")  # Return empty string on error
-
-
-def summarize_individual_texts(texts_and_urls, query, model, api_base, token):
-    # Generate text summaries in parallel using multiprocessing
-    args_list = [(url, text, query, model, api_base, token) for url, text in texts_and_urls]
-    
-    # Get number of CPUs to use
-    num_processes = mp.cpu_count()
-
-    # Create a process pool and process URLs in parallel
-    with Pool(processes=num_processes) as pool:
-        summaries = pool.map(process_url, args_list)
-    
-    return summaries
-
-
-def summarize(texts_and_urls, query, model, api_base, token):
-    # Prepare the context and prompt
-    context = "\n".join(
-        [f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
-    prompt = f"""You are an expert summarizer. Your task is to evaluate which parts if any of the
-        following document are relevant to the user's query. Summarize the relevant parts of the given text with regards to the original query: '{query}' \
-        and include the full URLs as references where appropriate. Use markdown to format your response. Add unicode characters where it makes sense to make the summary colorful. \
-        \n\n{context}"""
-
-    client = OpenAI(
-        base_url=api_base,
-        api_key=token
-    )
-
-    history = [{"role": "user", "content": prompt}]
-
-    try:
-        response = client.chat.completions.create(
-            model=model,
-            messages=history,
-            temperature=0,
-            max_tokens=2000
-        ).choices.pop().message.content
-        return remove_tags(response)
-    except Exception as e:
-        print(
-            f"An error occurred at final summarization: {e}")
-        return ""  # Return empty string on error
-
-
-def optimize_search_query(query, query_model, api_base):
-    # Prepare the prompt for optimizing the search query
-    prompt = f"Optimize the following natural language query to improve its effectiveness in a web search.\
-        Make it very concise. Return only exactly the optimized query text no additional texts, quotations or thoughts. Query: '{query}'"
-
-    # Create the payload for the POST request
-    payload = {
-        "model": query_model,
-        "prompt": prompt,
-        "stream": False,
-        "max_tokens": 50
-    }
-
-    # Send the POST request to the server
-    try:
-        print("Optimizing search query")
-        response = requests.post(api_base, json=payload)
-        if response.status_code == 200:
-            result = json.loads(response.text)
-            return (result["choices"][0]["text"].strip())
-        else:
-            print(
-                f"Failed to optimize search query from the server. Status code: {response.status_code}")
-            return query
-    except Exception as e:
-        print(
-            f"An error occurred while sending request to the server for optimizing the search query: {e}")
-        return query
-
-
-def pretty_print_markdown(markdown_text):
-    console = Console()
-    md = Markdown(markdown_text)
-    console.print(md)
-
-
-if __name__ == "__main__":
-    load_dotenv()
-    token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")
-    api_base = os.getenv("API_BASE")    
-    summary_model = os.getenv("SUMMARY_MODEL")
-
-    # Set up argument parser
-    parser = argparse.ArgumentParser(
-        description="Search DuckDuckGo, extract text from results, and summarize with LLM.")
-    parser.add_argument("query", type=str,
-                        help="The search query to use on DuckDuckGo")
-    parser.add_argument("--num_results", type=int, default=5,
-                        help="Number of search results to process (default: 5)")
-
-    # Parse arguments
-    args = parser.parse_args()
-
-    
-    original_query = args.query
-    
-    print(f"Query: {original_query}")
-
-    n = args.num_results  # Number of results to extract
-    links = duckduckgo_search(original_query, n)
-
-    print(f"Top {n} search results:")
-    for i, link in enumerate(links, start=1):
-        print(f"{i}. {link}")
-
-    texts_and_urls = extract_text_from_links(links)
-
-    print("Summarizing individual search results")
-    intermediate_summaries = summarize_individual_texts(
-        texts_and_urls,
-        original_query,
-        summary_model,
-        api_base,
-        token
-    )
-
-    final_summary = summarize(
-        intermediate_summaries,
-        original_query,
-        summary_model,
-        api_base,
-        token)
-
-    if final_summary:
-        pretty_print_markdown(final_summary)
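
For reference, the deleted script read OVH_AI_ENDPOINTS_ACCESS_TOKEN, API_BASE and SUMMARY_MODEL from a .env file and was invoked from the command line roughly as follows (the positional query argument and the --num_results flag follow the argparse setup above; the query text itself is only an illustrative placeholder):

    python main.py "your search query" --num_results 5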