The SpiderBase class and Setting.py

Summary

A spider base class built on a metaclass, plus the configuration values it loads from Setting.py.


SpiderBase

import random
import execjs
import requests
from urllib import parse

from spider import setting


class BaseClass(type):
    """Metaclass: registers crawl_* methods and copies setting values onto the class."""

    def __new__(cls, name, base, attrs):
        # collect every method whose name starts with "crawl_"
        attrs["cw_func"] = []
        count = 0
        for k, v in attrs.items():
            if k.startswith("crawl_"):
                attrs["cw_func"].append(k)
                count += 1
        attrs["func_count"] = count
        # only the root class receives the setting module and its public values
        if name == "BaseABC":
            attrs["setting"] = setting
            for key in dir(setting):
                if not key.startswith("__"):
                    attrs[key] = eval("setting.{}".format(key))
        return type.__new__(cls, name, base, attrs)


class BaseABC(metaclass=BaseClass):
    pass


class SpiderBase(BaseABC):
    def __init__(self):
        self.session = requests.Session()

    def download_page(self, url, **kwargs):
        """GET or POST a page through the shared session, optionally with cookies."""
        cookies = kwargs.pop("cookies", "")
        method = kwargs.pop("method", "get")
        if cookies and not isinstance(cookies, dict):
            # a raw "k1=v1; k2=v2" cookie string is converted to a dict first
            cookies = self.cookies(cookies)
        if method == "get":
            if cookies:
                resp = self.session.get(url, cookies=cookies, **kwargs)
            else:
                resp = self.session.get(url, **kwargs)
        elif method == "post":
            if cookies:
                resp = self.session.post(url, cookies=cookies, **kwargs)
            else:
                resp = self.session.post(url, **kwargs)
        return resp

    def _download_page(self, **kwargs):
        print(kwargs)

    def crawler(self, url):
        pass

    def extract_data(self, html):
        pass

    @property
    def headers(self):
        # pick a random User-Agent from setting.ua
        headers = {
            "User-Agent": random.choice(self.ua)
        }
        return headers

    def cookies(self, cookies):
        """Turn a 'k1=v1; k2=v2' cookie string into a dict."""
        cookie_dict = {i.split("=")[0]: i.split("=")[-1] for i in cookies.split("; ")}
        return cookie_dict

    def url(self, **kargs):
        """Fill a search URL template with the URL-encoded keyword."""
        _url = kargs.pop("url", "")
        search_key = kargs.pop("search_key", "")
        if search_key:
            search_key = parse.quote(search_key)
            _url = _url.format(search_key)
        return _url

    def schedule(self):
        pass

    def re_exract(self, html_text, pattern, filename=None):
        """Run a regex over html_text, or over the file contents if no text is given."""
        import re
        html = html_text or ""
        if not html and filename:
            with open(filename, 'r', encoding='utf-8') as f:
                html = f.read()
        pat = re.compile(pattern)
        ret = pat.findall(html)
        return ret

    def save_to_file(self, text, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            f.writelines(text)

    def save_to_db(self, db_object):
        pass

    def exec_js(self, js_file_path):
        """Compile a JavaScript file with PyExecJS so its functions can be called later."""
        with open(js_file_path, encoding='utf-8') as f:
            content = f.read()
        js = execjs.compile(content)
        return js


if __name__ == '__main__':
    sb = SpiderBase()
    search_key = "test"
    url = "url"
    sb.url(url=url, search_key=search_key)
    print(sb.setting.deny)
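To illustrate how the metaclass wires things together, here is a minimal usage sketch that is not part of the original post: the subclass name DemoSpider, the module path spider/spider_base.py, and the example calls are assumptions. Any method whose name starts with crawl_ is collected into cw_func, and every public name in setting becomes a class attribute.

# Usage sketch (assumptions: the SpiderBase code above is saved as spider/spider_base.py
# next to spider/setting.py; DemoSpider is made up for illustration).
from spider.spider_base import SpiderBase


class DemoSpider(SpiderBase):
    def crawl_list(self, url):
        # inherited session helper plus a random User-Agent from setting.ua
        return self.download_page(url, headers=self.headers)

    def crawl_detail(self, url):
        return self.download_page(url, headers=self.headers)


if __name__ == '__main__':
    spider = DemoSpider()
    print(spider.cw_func)     # ['crawl_list', 'crawl_detail'], collected by BaseClass
    print(spider.func_count)  # 2
    print(spider.ua[0])       # first User-Agent copied from setting.py onto BaseABC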

Setting.py

The configuration values that the metaclass loads onto the spider classes:

# User-Agent strings to rotate through when building request headers
ua = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)"
]

# deny list, empty by default
deny = []

# MySQL connection settings
MYSQL = {
    "host": "localhost",
    "user": "root",
    "password": "root",
    "port": 3306,
    "db": "movie",
    "charset": "utf8"
}
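The keys in MYSQL line up with the keyword arguments of a typical MySQL driver, so the dict can be unpacked straight into a connect call. Below is a minimal sketch of how a save_to_db-style helper could use it; pymysql, the save_rows function, and the movie table with title/url columns are assumptions not found in the original post.

# Sketch only (assumption: pymysql is installed; the table and columns are hypothetical).
# The keys in setting.MYSQL (host, user, password, port, db, charset) match
# pymysql.connect() keyword arguments, so the dict can be unpacked directly.
import pymysql

from spider import setting


def save_rows(rows):
    conn = pymysql.connect(**setting.MYSQL)
    try:
        with conn.cursor() as cursor:
            cursor.executemany(
                "INSERT INTO movie (title, url) VALUES (%s, %s)",  # hypothetical table/columns
                rows,
            )
        conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    save_rows([("example title", "https://example.com/1")])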
