SpiderBase
import random
import re
from urllib import parse

import execjs
import requests

from spider import setting


class BaseClass(type):
    """Metaclass: registers every crawl_* method a class defines in
    cw_func, and copies all names from the setting module onto the
    BaseABC base class so subclasses can read them as attributes."""

    def __new__(cls, name, base, attrs):
        attrs["cw_func"] = [k for k in attrs if k.startswith("crawl_")]
        attrs["func_count"] = len(attrs["cw_func"])
        if name == "BaseABC":
            attrs["setting"] = setting
            for key in dir(setting):
                if not key.startswith("__"):
                    # getattr replaces the original eval("setting.{}".format(key))
                    attrs[key] = getattr(setting, key)
        return type.__new__(cls, name, base, attrs)


class BaseABC(metaclass=BaseClass):
    pass


class SpiderBase(BaseABC):
    def __init__(self):
        # One shared session so cookies and keep-alive persist across requests
        self.session = requests.Session()

    def download_page(self, url, **kwargs):
        """Fetch url via GET or POST; cookies may be a dict or a raw
        "k=v; k2=v2" Cookie header string."""
        cookies = kwargs.pop("cookies", "")
        method = kwargs.pop("method", "get")
        if cookies and not isinstance(cookies, dict):
            cookies = self.cookies(cookies)
        if method == "get":
            resp = self.session.get(url, cookies=cookies or None, **kwargs)
        elif method == "post":
            resp = self.session.post(url, cookies=cookies or None, **kwargs)
        else:
            raise ValueError("unsupported method: {}".format(method))
        return resp

    def _download_page(self, **kwargs):
        print(kwargs)

    def crawler(self, url):
        # Hook: subclasses drive the actual crawl here
        pass

    def extract_data(self, html):
        # Hook: subclasses parse a downloaded page here
        pass

    @property
    def headers(self):
        # Pick a random User-Agent from setting.ua on every access
        return {"User-Agent": random.choice(self.ua)}

    def cookies(self, cookies):
        """Turn a raw "k=v; k2=v2" cookie string into a dict; maxsplit=1
        keeps values that themselves contain "="."""
        return {i.split("=", 1)[0]: i.split("=", 1)[-1]
                for i in cookies.split("; ")}

    def url(self, **kwargs):
        """Build a search URL by quoting search_key and substituting it
        into the {} placeholder of the url template."""
        _url = kwargs.pop("url", "")
        search_key = kwargs.pop("search_key", "")
        if search_key:
            _url = _url.format(parse.quote(search_key))
        return _url

    def schedule(self):
        pass

    def re_extract(self, html_text, pattern, filename=None):
        """Run findall with pattern over html_text, or over the contents
        of filename when no text is given."""
        if html_text:
            html = html_text
        elif filename:
            with open(filename, 'r', encoding='utf-8') as f:
                html = f.read()
        else:
            return []
        return re.findall(pattern, html)

    def save_to_file(self, text, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            f.writelines(text)

    def save_to_db(self, db_object):
        pass

    def exec_js(self, js_file_path):
        """Compile a JavaScript file with PyExecJS; call its functions
        from Python via the returned context's .call()."""
        with open(js_file_path, encoding='utf-8') as f:
            content = f.read()
        return execjs.compile(content)


if __name__ == '__main__':
    sb = SpiderBase()
    search_key = "test"
    url = "https://example.com/s?wd={}"  # dummy template with a {} slot
    print(sb.url(url=url, search_key=search_key))
    print(sb.setting.deny)
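Because BaseClass scans each class body for crawl_* names and copies everything from setting onto BaseABC, a subclass gets its crawler registry and its configuration for free. A minimal sketch of how that plays out (DemoSpider, its crawl_* methods, and the module name spider_base are illustrative assumptions, not part of the original code):

# demo.py -- illustrative sketch; assumes the SpiderBase module above is
# saved as spider_base.py and that the spider/setting.py package is importable
from spider_base import SpiderBase


class DemoSpider(SpiderBase):
    # Hypothetical crawl_* methods: they exist only to show what the
    # BaseClass metaclass collects at class-creation time.
    def crawl_list(self, url):
        return self.download_page(url, headers=self.headers).text

    def crawl_detail(self, url):
        return self.download_page(url, method="post", headers=self.headers).text


if __name__ == '__main__':
    spider = DemoSpider()
    print(spider.cw_func)     # ['crawl_list', 'crawl_detail']
    print(spider.func_count)  # 2
    print(spider.ua[0])       # first User-Agent inherited from setting.py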
setting.py
Some configuration values; the BaseClass metaclass copies each name defined here onto BaseABC, so spiders can read them as instance attributes.
# Pool of browser User-Agent strings; SpiderBase.headers picks one at random
ua = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
]

# URL patterns the spider should skip (empty for now)
deny = []

# MySQL connection settings
MYSQL = {
    "host": "localhost",
    "user": "root",
    "password": "root",
    "port": 3306,
    "db": "movie",
    "charset": "utf8",
}
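save_to_db is left as a stub, but the MYSQL dict maps one-to-one onto a DB-API connect call. A sketch with pymysql (the driver choice is an assumption; the post never names one):

# sketch: opening a connection from the MYSQL settings above; pymysql is
# an assumed driver, not something the original code specifies
import pymysql

from spider import setting

conn = pymysql.connect(
    host=setting.MYSQL["host"],
    user=setting.MYSQL["user"],
    password=setting.MYSQL["password"],
    port=setting.MYSQL["port"],
    database=setting.MYSQL["db"],
    charset=setting.MYSQL["charset"],
)
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT VERSION()")
        print(cursor.fetchone())
finally:
    conn.close()

Since the dict keys match pymysql's legacy db= alias, pymysql.connect(**setting.MYSQL) should also work, but the explicit mapping survives a driver swap.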