```bash
# Basic usage
python script.py --cookie "your_cookie_here"

# Limit the number of pages to crawl
python script.py --cookie "your_cookie" --pages 5
```
```python
import argparse
import os
import re
import hashlib
import unicodedata
import time
from urllib.parse import urljoin, unquote
from bs4 import BeautifulSoup
import requests
import mimetypes


# ================== Utility functions ==================
def sanitize_filename(filename, is_attachment=False):
    """Normalize a title or attachment name into a safe filename."""
    clean_name = unicodedata.normalize('NFKC', filename)
    clean_name = re.sub(r'[\\/*?:"<>|]', '_', clean_name).strip()
    max_len = 120 if is_attachment else 100
    return clean_name[:max_len] + ('' if is_attachment else '.html')


def extract_post_id(link):
    """Pull the numeric ID out of a /post/<id>/ link."""
    match = re.search(r'/post/(\d+)/', link)
    return match.group(1) if match else 'unknown'


def process_resources(session, soup, post_id, base_dir):
    """Download the images and attachments referenced by a report and
    rewrite their URLs to local relative paths."""
    img_dir = os.path.join(base_dir, 'images')
    attach_dir = os.path.join(base_dir, 'attachments')
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(attach_dir, exist_ok=True)

    # Images: file name is the MD5 of the image URL plus a guessed extension
    for img in soup.find_all('img'):
        try:
            img_url = urljoin('https://src.sjtu.edu.cn', img['src'])
            response = session.get(img_url, stream=True)
            response.raise_for_status()
            file_hash = hashlib.md5(img_url.encode()).hexdigest()
            ext = mimetypes.guess_extension(response.headers['Content-Type'].split(';')[0]) or '.jpg'  # default to .jpg
            filename = f"{file_hash}{ext}"
            with open(os.path.join(img_dir, filename), 'wb') as f:
                for chunk in response.iter_content(1024 * 1024):
                    f.write(chunk)
            img['src'] = f'images/{filename}'
        except Exception as e:
            print(f"Failed to process image: {str(e)}")
            img.decompose()

    # Attachments: file name is the post ID plus the sanitized link text
    for a in soup.find_all('a', href=re.compile(r'/post/attach/')):
        try:
            attach_url = urljoin('https://src.sjtu.edu.cn', a['href'])
            response = session.get(attach_url, stream=True)
            response.raise_for_status()
            ext = extract_from_header(response) or ''  # no extension if the header gives none
            if ext and not ext.startswith('.'):
                ext = f'.{ext}'
            safe_name = sanitize_filename(a.text.strip(), is_attachment=True)
            filename = f"{post_id}_{safe_name}{ext}"
            with open(os.path.join(attach_dir, filename), 'wb') as f:
                for chunk in response.iter_content(1024 * 1024):
                    f.write(chunk)
            a['href'] = f'attachments/{filename}'
        except Exception as e:
            print(f"Failed to process attachment: {str(e)}")
            a.decompose()
    return soup


def extract_from_header(response):
    """Guess a file extension from the Content-Disposition header."""
    content_disp = response.headers.get("Content-Disposition", "")
    try:
        # RFC 5987 form: filename*=UTF-8''<percent-encoded name>
        encoded_match = re.search(r"filename\*=UTF-8''(.+?)(?:;|$)", content_disp)
        if encoded_match:
            decoded_name = unquote(encoded_match.group(1))
            return os.path.splitext(decoded_name)[1]
        # Plain form: filename="<name>"
        filename_match = re.search(r'filename="?([^;"]+?)"?;?', content_disp)
        if filename_match:
            decoded_name = unquote(filename_match.group(1))
            return os.path.splitext(decoded_name)[1]
    except (IndexError, AttributeError, re.error) as e:
        print(f"Failed to extract extension: {str(e)}")
    return None


def save_posts(session, links):
    """Fetch each report, localize its resources, and save it as HTML."""
    if not os.path.exists('report'):
        os.makedirs('report')
    for idx, link in enumerate(links, 1):
        try:
            post_id = extract_post_id(link)
            post_url = f"https://src.sjtu.edu.cn{link}"
            response = session.get(post_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            article = soup.find('article', class_='am-article')
            # The Chinese literal is the site's own notice ("this vulnerability was
            # approved more than 7 days ago"), so it must stay untranslated.
            if not article or "该漏洞已经审核通过7天以上" in response.text:
                print(f"Skipping report with no visible content: {link}")
                continue
            processed_soup = process_resources(session, article, post_id, 'report')
            title_tag = soup.find('h1', class_='am-article-title')
            if not title_tag:
                print(f"Skipping report with no title: {link}")
                continue
            base_filename = sanitize_filename(f"[{post_id}]{title_tag.get_text(strip=True)}")
            with open(f"report/{base_filename}", 'w', encoding='utf-8') as f:
                f.write(str(processed_soup))
            print(f"Saved report [{idx}/{len(links)}] {base_filename}")
        except Exception as e:
            print(f"Failed to process {link}: {str(e)}")


def get_max_page(session, base_url):
    """Find the largest ?page=N value among the pagination links."""
    try:
        response = session.get(base_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        max_page = 1
        page_pattern = re.compile(r'\?page=(\d+)')
        for a in soup.find_all('a', href=True):
            match = page_pattern.search(a['href'])
            if match:
                current_page = int(match.group(1))
                max_page = max(max_page, current_page)
        return max_page
    except Exception as e:
        print(f"Failed to determine the maximum page number: {str(e)}")
        return None


def collect_links(session, base_url, max_pages):
    """Collect /post/<id>/ links from the first max_pages listing pages."""
    collected_links = set()
    try:
        for page in range(1, max_pages + 1):
            url = f"{base_url}?page={page}"
            response = session.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            for a in soup.select('a[href*="/post/"]'):
                href = a['href']
                if re.match(r'^/post/\d+/$', href):
                    collected_links.add(href)
            time.sleep(1)  # polite delay between listing pages
        return list(collected_links)
    except Exception as e:
        print(f"Failed to collect links: {str(e)}")
        return []


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='EDUSRC vulnerability report backup tool')
    parser.add_argument('--cookie', required=True, help='Authentication cookie')
    parser.add_argument('--pages', type=int, help='Number of pages to fetch (optional)')
    args = parser.parse_args()

    headers = {
        'Cookie': args.cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    with requests.Session() as s:
        s.headers.update(headers)
        max_page = get_max_page(s, 'https://src.sjtu.edu.cn/profile/post/')
        if not max_page:
            exit(1)
        target_pages = args.pages or max_page
        actual_pages = min(target_pages, max_page)
        print(f"Pages to crawl: {actual_pages}/{max_page}")
        post_links = collect_links(s, 'https://src.sjtu.edu.cn/profile/post/', actual_pages)
        print(f"Found {len(post_links)} report links")
        if post_links:
            save_posts(s, post_links)
```
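After a run, everything is written under a `report/` directory next to the script. Based on the naming scheme in the code, the output layout looks roughly like this (the ID, hash, and file names below are illustrative):

```
report/
├── [123456]Example report title.html              # one rewritten HTML file per report
├── images/
│   └── 9e107d9d372bb6826bd81d3542a419d6.png       # MD5 of the image URL + guessed extension
└── attachments/
    └── 123456_poc.zip                             # post ID + sanitized attachment name
```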