|
|
@@ -36,15 +36,16 @@ def duckduckgo_search(query, num_results=5):
|
|
|
return result_links
|
|
|
|
|
|
|
|
|
-def extract_text_from_links(links):
|
|
|
+def extract_text_from_links(links, timeout=5):
|
|
|
extracted_texts = []
|
|
|
headers = {
|
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
|
|
|
}
|
|
|
|
|
|
for link in links:
|
|
|
+ print("downloading text from: " + link)
|
|
|
try:
|
|
|
- response = requests.get(link, headers=headers)
|
|
|
+ response = requests.get(link, headers=headers, timeout=timeout)
|
|
|
if response.status_code == 200:
|
|
|
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
# Extract text from the page
|