1. 爬虫基础
import requests
from bs4 import BeautifulSoup
# Fetch a page, then print its title and the target of every link it contains.
url = 'https://www.baidu.com'
# A timeout keeps the script from blocking indefinitely on an unresponsive
# server; requests applies no timeout unless one is given.
response = requests.get(url, timeout=10)
if response.status_code == 200:
    # Decode the raw bytes explicitly as UTF-8 rather than using the
    # encoding requests derives for response.text.
    html_content = response.content.decode('utf-8')
    soup = BeautifulSoup(html_content, 'html.parser')
    # Page title; soup.title is None when the document has no <title> tag,
    # so guard the attribute access instead of crashing.
    title = soup.title.string if soup.title is not None else None
    print('网页标题:', title)
    # Every anchor element; .get('href') yields None for anchors that have
    # no href attribute.
    links = soup.find_all('a')
    for link in links:
        print('链接:', link.get('href'))
else:
    print('Failed to retrieve the webpage')
2. 数据采集与分析
# 豆瓣电影Top250排行榜
import requests
from bs4 import BeautifulSoup
# Fetch the Douban Top 250 page and print each listed movie's name and rating.
url = 'https://movie.douban.com/top250'
# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# A timeout keeps the script from blocking indefinitely on an unresponsive
# server; requests applies no timeout unless one is given.
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 200:
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    # Locate the elements that wrap one movie entry each.
    movie_items = soup.find_all(class_='item')
    for item in movie_items:
        title_tag = item.find(class_='title')
        rating_tag = item.find(class_='rating_num')
        # find() returns None when no match exists; skip incomplete entries
        # instead of failing with AttributeError on .get_text().
        if title_tag is None or rating_tag is None:
            continue
        movie_name = title_tag.get_text()
        rating = rating_tag.get_text()
        print('电影名称:', movie_name)
        print('评分:', rating)
        print('----------------------')
else:
    print('Failed to fetch the page')
import requests
from bs4 import BeautifulSoup
# Fetch the Douban Top 250 page, collect each movie's name and rating, and
# print the movies ordered by rating from highest to lowest.
url = 'https://movie.douban.com/top250'
# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# Bind the result name up front so code that runs after this snippet can
# iterate it even when the request fails (it is then simply empty).
sorted_movies = []
# A timeout keeps the script from blocking indefinitely on an unresponsive
# server; requests applies no timeout unless one is given.
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 200:
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    # Locate the elements that wrap one movie entry each.
    movie_items = soup.find_all(class_='item')
    # Accumulates one {'name': ..., 'rating': ...} dict per movie.
    movies = []
    for item in movie_items:
        title_tag = item.find(class_='title')
        rating_tag = item.find(class_='rating_num')
        # find() returns None when no match exists; skip incomplete entries
        # instead of failing with AttributeError on .get_text().
        if title_tag is None or rating_tag is None:
            continue
        movie_name = title_tag.get_text()
        rating = rating_tag.get_text()
        # Store the rating as a float so the sort below is numeric.
        movies.append({'name': movie_name, 'rating': float(rating)})
    # Highest rating first.
    sorted_movies = sorted(movies, key=lambda x: x['rating'], reverse=True)
    for movie in sorted_movies:
        print('电影名称:', movie['name'])
        print('评分:', movie['rating'])
        print('----------------------')
else:
    print('Failed to fetch the page')
##保存结果到excel文件中
import openpyxl
# Write the sorted results to an Excel workbook: a header row followed by
# one row per movie (name, rating). Reads the `sorted_movies` list built by
# the preceding snippet.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.append(['电影名称', '评分'])
for movie in sorted_movies:
    sheet.append([movie['name'], movie['rating']])
# NOTE(review): the original literal read 'C:UsersralapDesktopPython...';
# the raw-string prefix suggests the backslash separators were lost, so they
# are restored here. Confirm the directory boundaries and adjust the path
# for your own machine.
workbook.save(r'C:\Users\ralap\Desktop\Python\豆瓣电影Top250.xlsx')
5. 注意事项与道德规范
在爬取之前,请先查看目标网站的 robots.txt 文件,了解其爬取策略。
原文始发于微信公众号(网络个人修炼):使用Python进行网页爬取
免责声明:文章中涉及的程序(方法)可能带有攻击性,仅供安全研究与教学之用,读者将其信息做其他用途,由读者承担全部法律及连带责任,本站不承担任何法律及连带责任;如有问题可邮件联系(建议使用企业邮箱或有效邮箱,避免邮件被拦截,联系方式见首页),望知悉。
- 左青龙
- 微信扫一扫
-
- 右白虎
- 微信扫一扫
-
评论