import requests
from bs4 import BeautifulSoup
import time
import os
import random
import json

def baidu_search(keyword, page_num=3):
    # Rotate through a small pool of desktop Chrome user agents to look less like a bot.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    ]
    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept-Language": "zh-CN,zh;q=0.9",
    }
    results = []
    added_links = set()  # de-duplicate results across pages by link

    for page in range(page_num):
        pn = page * 10  # Baidu paginates 10 results per page via the pn parameter
        search_url = f"https://www.baidu.com/s?wd={requests.utils.quote(keyword)}&pn={pn}"
        print(search_url)
        try:
            response = requests.get(search_url, headers=headers, timeout=10)
            response.raise_for_status()
            print(f"Page {page + 1} fetched, status code: {response.status_code}")
            # Baidu pages occasionally mis-declare their charset; trust the detected one.
            response.encoding = response.apparent_encoding
            soup = BeautifulSoup(response.text, 'html.parser')
            # Each organic search result sits in a div with class "c-container".
            containers = soup.find_all('div', class_='c-container')
            for container in containers:
                title_tag = container.find('h3', class_='t')
                if not title_tag or not title_tag.a:
                    continue
                title = title_tag.get_text(strip=True)
                # Use .get() so an <a> without an href cannot raise a KeyError.
                link = title_tag.a.get('href', '')
                if not link or link in added_links:
                    continue
abstract = "" abstract_tag = container.find('div', class_='c-abstract') or \ container.find('div', class_='content-right_8Zs40') or \ container.find('span', class_='c-font-normal') or \ container.find('span', class_='content-right_2s-H4') or \ container.find('div', class_='c-span-last') or \ container.find('span', class_='content-right_1THTn') or \ container.find('div', class_='right-link_NlGkt')
if abstract_tag: abstract = abstract_tag.get_text(strip=True).replace('\n', ' ')
                results.append({
                    'title': title,
                    'abstract': abstract if abstract else "No abstract available",
                })
                added_links.add(link)
            # Stop once roughly page_num pages' worth of results have been collected.
            if len(results) >= 10 * page_num:
                break
            # Random delay between pages to reduce the chance of being blocked.
            time.sleep(random.uniform(1, 3))
        except Exception as e:
            print(f"Page {page + 1} request failed: {e}")
    return results
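
# Note: Baidu result links are usually redirect URLs of the form
# https://www.baidu.com/link?url=... rather than the real destination.
# A minimal optional sketch to resolve them (resolve_baidu_link is a
# hypothetical helper, not part of the original script; it assumes the
# target server answers HEAD requests and redirects within the timeout):
def resolve_baidu_link(link, headers, timeout=10):
    """Follow Baidu's redirect chain and return the final destination URL."""
    try:
        response = requests.head(link, headers=headers,
                                 allow_redirects=True, timeout=timeout)
        return response.url
    except requests.RequestException:
        return link  # fall back to the redirect URL itself
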
def save_results_to_file(results, keyword):
    filename = os.path.join(os.getcwd(), f"{keyword}_search_results.json")
    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese text human-readable in the output file.
        json.dump({
            "keyword": keyword,
            "total": len(results),
            "results": results,
        }, f, ensure_ascii=False, indent=2)
print(f"√ 搜索结果已保存至 {filename}")
if __name__ == "__main__": keyword = "李白" search_results = baidu_search(keyword, page_num=10)
    if search_results:
        print("√ Search succeeded")
        save_results_to_file(search_results, keyword)
    else:
        print("× Search failed; check your network connection and try again")
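    # Optional preview (not in the original script): print the first few hits
    # as a quick sanity check; safe on an empty result list.
    for item in search_results[:3]:
        print(f"- {item['title']}: {item['abstract'][:60]}")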