Recently I've been tinkering with some tedious stuff: a crawler plus scanning, with the scanning handed off to sqlmapapi. There isn't a lot of material on this yet, but some can still be found:
"Batch scanning practice with sqlmapapi.py" (《使用sqlmapapi.py批量化扫描实践》): http://drops.wooyun.org/tips/6653
Take a look at the class it wraps around sqlmapapi:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import time
import json


class AutoSqli(object):
    """
    Interact with a running sqlmapapi server through its REST interface.
    By Manning
    """

    def __init__(self, server='', target='', data='', referer='', cookie=''):
        super(AutoSqli, self).__init__()
        self.server = server
        if self.server[-1] != '/':
            self.server = self.server + '/'
        self.target = target
        self.taskid = ''
        self.engineid = ''
        self.status = ''
        self.data = data
        self.referer = referer
        self.cookie = cookie
        self.start_time = time.time()

    # Create a new scan task
    def task_new(self):
        self.taskid = json.loads(
            requests.get(self.server + 'task/new').text)['taskid']
        print 'Created new task: ' + self.taskid
        # The taskid drives every subsequent request
        if len(self.taskid) > 0:
            return True
        return False

    # Delete a scan task
    def task_delete(self):
        if json.loads(requests.get(self.server + 'task/' + self.taskid + '/delete').text)['success']:
            print '[%s] Deleted task' % (self.taskid)
            return True
        return False

    # Start the scan task
    def scan_start(self):
        headers = {'Content-Type': 'application/json'}
        # The URL to be scanned
        payload = {'url': self.target}
        url = self.server + 'scan/' + self.taskid + '/start'
        # e.g. http://127.0.0.1:8557/scan/xxxxxxxxxx/start
        t = json.loads(
            requests.post(url, data=json.dumps(payload), headers=headers).text)
        self.engineid = t['engineid']
        if len(str(self.engineid)) > 0 and t['success']:
            print 'Started scan'
            return True
        return False

    # Query the status of the scan task
    def scan_status(self):
        self.status = json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/status').text)['status']
        if self.status == 'running':
            return 'running'
        elif self.status == 'terminated':
            return 'terminated'
        else:
            return 'error'

    # Fetch the details (findings) of the scan task
    def scan_data(self):
        self.data = json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/data').text)['data']
        if len(self.data) == 0:
            print 'not injection:\t'
        else:
            print 'injection:\t' + self.target

    # Scan settings, mainly the option values
    def option_set(self):
        headers = {'Content-Type': 'application/json'}
        option = {"options": {
            "smart": True,
            # ... (remaining options elided in the original)
        }}
        url = self.server + 'option/' + self.taskid + '/set'
        t = json.loads(
            requests.post(url, data=json.dumps(option), headers=headers).text)
        print t

    # Stop the scan task
    def scan_stop(self):
        json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/stop').text)['success']

    # Kill the scan task process
    def scan_kill(self):
        json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/kill').text)['success']

    def run(self):
        if not self.task_new():
            return False
        self.option_set()
        if not self.scan_start():
            return False
        while True:
            if self.scan_status() == 'running':
                time.sleep(10)
            elif self.scan_status() == 'terminated':
                break
            else:
                break
            print time.time() - self.start_time
            if time.time() - self.start_time > 3000:
                error = True
                self.scan_stop()
                self.scan_kill()
                break
        self.scan_data()
        self.task_delete()
        print time.time() - self.start_time


if __name__ == '__main__':
    t = AutoSqli('http://127.0.0.1:8774', 'http://192.168.3.171/1.php?id=1')
    t.run()
Its workflow is as follows (a bare-bones sketch of the raw requests appears right after this list):
GET request to create a new task and obtain its task id
POST request against that task id to set the scan options (option/set is a POST route)
POST request against that task id to start scanning the specified URL
GET request against that task id to query the scan status
GET request against that task id to fetch the test results
GET request against that task id to delete the task
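Here is that raw request sequence written out with the requests library, as a minimal sketch. It assumes the API server has already been started in server mode (python sqlmapapi.py -s -p 8775); the port, the 'smart' option and the target URL are placeholders of mine, not values from the article.

# A bare-bones sketch of the raw sqlmapapi request sequence.
# The server port and target URL below are placeholders.
import json
import time
import requests

server = 'http://127.0.0.1:8775'
target = 'http://192.168.3.171/1.php?id=1'

# 1. create a task and grab its id
taskid = requests.get(server + '/task/new').json()['taskid']

# 2. set options (sent as top-level keys of the JSON body)
requests.post(server + '/option/' + taskid + '/set',
              data=json.dumps({'smart': True}),
              headers={'Content-Type': 'application/json'})

# 3. start the scan against the target URL
requests.post(server + '/scan/' + taskid + '/start',
              data=json.dumps({'url': target}),
              headers={'Content-Type': 'application/json'})

# 4. poll the status until the scan terminates
while requests.get(server + '/scan/' + taskid + '/status').json()['status'] == 'running':
    time.sleep(5)

# 5. fetch the results, then 6. delete the task
print requests.get(server + '/scan/' + taskid + '/data').json()['data']
requests.get(server + '/task/' + taskid + '/delete')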
Going into the Server class in lib/utils/api.py, you can see that clients interact with the service by submitting data to the server. The handlers fall into three groups:
Users' methods
Admin functions
sqlmap core interact functions
The routes that accept data are as follows.
Users' methods
@get("/task/new")
@get("/task/<taskid>/delete")
Admin functions
@get("/admin/<taskid>/list")
@get("/admin/<taskid>/flush")
Core interact functions
@get("/option/<taskid>/list")
@post("/option/<taskid>/get")
@post("/option/<taskid>/set")
@post("/scan/<taskid>/start")
@get("/scan/<taskid>/stop")
@get("/scan/<taskid>/kill")
@get("/scan/<taskid>/status")
@get("/scan/<taskid>/data")
@get("/scan/<taskid>/log/<start>/<end>")
@get("/scan/<taskid>/log")
@get("/download/<taskid>/<target>/<filename>")
Finally, as for deciding whether a target has an injection vulnerability, the code judges it like this: if the 'data' field of the returned dictionary is non-empty, there is an injection.
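Expressed as a snippet (reusing the server, taskid and target placeholders from the sketch above):

# An empty 'data' list means sqlmap found nothing; anything in it means
# at least one injectable parameter was identified.
result = requests.get(server + '/scan/' + taskid + '/data').json()
if len(result['data']) > 0:
    print 'injection:\t' + target
else:
    print 'not injection:\t' + target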
Then I took the spider module from https://github.com/smarttang/w3a_Scan_Console/blob/master/module/sprider_module.py and integrated it with a few small changes:
#!/usr/bin/python
# vim: set fileencoding=utf-8:
import sys
import urllib2
import re
from BeautifulSoup import BeautifulSoup
import autosql


class SpriderUrl:
    # Initialization
    def __init__(self, url):
        self.url = url
        # self.con = Db_Connector('sprider.ini')

    # Collect the first batch of URLs from the target page
    def get_self(self):
        urls = []
        try:
            body_text = urllib2.urlopen(self.url).read()
        except:
            print "[*] Web Get Error:checking the Url"
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # We have the raw href, but it still needs processing
            _url = link.get('href')
            # First drop None values and meaningless prefixes,
            # and skip suffixes that are not worth crawling
            if _url is None or re.match('^(javascript|:;|#)', _url) or \
                    re.match('\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                continue
            # For links starting with http|https, only keep those inside
            # the current site -- we do not crawl beyond it
            if re.match('^(http|https)', _url):
                if not re.match('^' + self.url, _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        rst = list(set(urls))
        for rurl in rst:
            try:
                # Recurse into every page. The obvious drawback is that every page
                # gets re-crawled over and over; each URL is then handed to autosql.
                self.sprider_self_all(rurl)
                # AutoSqli('http://127.0.0.1:8775', rurl).run()
            except:
                print "spider error"

    def sprider_self_all(self, domain):
        urls = []
        try:
            body_text = urllib2.urlopen(domain).read()
        except:
            print "[*] Web Get Error:checking the Url"
            sys.exit(0)
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # We have the raw href, but it still needs processing
            _url = link.get('href')
            # First drop None values and meaningless prefixes,
            # and skip suffixes that are not worth crawling
            try:
                if _url is None or re.match('^(javascript|:;|#)', str(_url)) or \
                        re.match('\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', str(_url)):
                    continue
            except TypeError:
                print "[*] Type is Error! :" + str(_url)
                continue
            # For links starting with http|https, only keep those inside
            # the current site -- we do not crawl beyond it
            if re.match('^(http|https)', _url):
                if not re.match('^' + self.url, _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        res = list(set(urls))
        for rurl in res:
            try:
                print rurl
                # AutoSqli('http://127.0.0.1:8775', rurl).run()
            except:
                print "spider error"


spi = "http://0day5.com/"
t = SpriderUrl(spi)
# First pass
t.get_self()
The better approach is to store the URLs in a database and check for duplicates there:
for rurl in res:
    if self.con.find_item("select * from url_sprider where url='" + rurl + "' and domain='" + self.url + "'"):
        continue
    else:
        try:
            self.con.insert_item("insert into url_sprider(url,tag,domain)values('" + rurl + "',0,'" + self.url + "')")
        except:
            print "[*] insert into is Error!"
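The snippet above builds its SQL by plain string concatenation, which is fragile (and, ironically for an injection scanner, itself injectable). Below is a small sketch of the same dedup logic using sqlite3 with parameterized queries; the sprider.db file and the url_sprider(url, tag, domain) table are my assumptions, not part of the original Db_Connector.

# Hypothetical sqlite3 variant of the dedup loop above; meant to sit in the
# same method, so 'res' and 'self.url' come from the surrounding code.
import sqlite3

con = sqlite3.connect('sprider.db')
con.execute("create table if not exists url_sprider(url text, tag integer, domain text)")
for rurl in res:
    cur = con.execute("select 1 from url_sprider where url=? and domain=?", (rurl, self.url))
    if cur.fetchone():
        continue  # already recorded for this domain, skip it
    con.execute("insert into url_sprider(url,tag,domain) values(?, 0, ?)", (rurl, self.url))
con.commit()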
Lately I have still been sorting out notes on the crawler side:
1. Many crawlers have very obvious fingerprints; the countermeasure is to set an appropriate User-Agent.
2. Some WAFs can be bypassed through the Referer, for instance by pretending the request comes from Baidu.
Here is the improved version:
import random

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

REFERERS = [
    "https://www.baidu.com",
    "http://www.baidu.com",
    "https://www.google.com.hk",
    "http://www.so.com",
    "http://www.sogou.com",
    "http://www.soso.com",
    "http://www.bing.com",
]

default_cookies = {}

# Pick a random User-Agent and Referer
default_headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': random.choice(REFERERS),
    'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
}
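To make the spider above actually send these headers, the urllib2 calls need to carry them. A small usage sketch; the fetched URL is just an example of mine:

# Hypothetical helper: fetch a page with the randomized headers above so the
# spider's requests look like an ordinary browser visit.
import urllib2

def fetch(url):
    req = urllib2.Request(url, headers=default_headers)
    return urllib2.urlopen(req, timeout=10).read()

body_text = fetch("http://192.168.3.171/1.php?id=1")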
Then there is still the filtering problem, i.e. similarity checking, so that the crawler stays effective and the results become more precise. The algorithm relies on decomposing the URL and hashing the decomposition, and it fits this kind of common requirement well. It splits a URL into three dimensions: the first is the netloc, the second is the length of each path segment, and the third is the sorted list of query parameter names. A data structure combines these three dimensions into a hashable object. [From: http://drops.wooyun.org/tips/5462]
#!/usr/bin/env python
# coding:utf-8
import time
import os
import urlparse
import hashlib
import sys
sys.path.append("..")
from config.config import *

reload(sys)
sys.setdefaultencoding("utf-8")

SIMILAR_SET = set()
REPEAT_SET = set()

'''
2015.3.30
Be clear about what the crawler does:
Focusing:  return True if the keyword is in the url, otherwise False
Filtering: return False if the keyword is in the url, otherwise True
'''


def format(url):
    '''
    The strategy is to build a 3-tuple:
    the first item is the url's netloc,
    the second item is the length of each path segment,
    the third item is the name of every query parameter
    (sorted alphabetically to avoid duplicates caused only by ordering)
    '''
    if urlparse.urlparse(url)[2] == '':
        url = url + '/'
    url_structure = urlparse.urlparse(url)
    netloc = url_structure[1]
    path = url_structure[2]
    query = url_structure[4]
    temp = (netloc,
            tuple([len(i) for i in path.split('/')]),
            tuple(sorted([i.split('=')[0] for i in query.split('&')])))
    # print temp
    return temp


def check_netloc_is_ip(netloc):
    '''
    Return True if the url's netloc looks like an IP address, otherwise False
    '''
    flag = 0
    t = netloc.split('.')
    for i in t:
        try:
            int(i)
            flag += 1
        except Exception, e:
            break
    if flag == 4:
        return True
    return False


def url_domain_control(url, keyword):
    '''
    URL domain control (focusing)
    True  - the url passes the domain check
    False - it does not
    1. keyword may be a list or a str
    2. if the url's netloc is an IP address, return True
    '''
    t = format(url)
    if check_netloc_is_ip(t[0]):
        return True
    elif str(type(keyword)) == "<type 'list'>":
        for i in keyword:
            if i.lower() in t[0].lower():
                return True
    elif str(type(keyword)) == "<type 'str'>":
        if keyword.lower() in t[0].lower():
            return True
    return False


def url_domain_control_ignore(url, keyword):
    '''
    URL domain control (filtering)
    True  - no ignore keyword appears in the url
    False - an ignore keyword appears in the url
    e.g. ignoring "blog": if the netloc contains "blog", return False
    '''
    t = format(url)
    for i in keyword:
        if i in t[0].lower():
            return False
    return True


def url_similar_control(url):
    '''
    URL similarity control
    True  - the url is not a duplicate
    False - the url is a duplicate
    '''
    t = format(url)
    if t not in SIMILAR_SET:
        SIMILAR_SET.add(t)
        return True
    return False


def url_format_control(url):
    '''
    URL format control (filtering)
    True  - the url passes the format check
    False - it does not
    '''
    if '}' not in url and '404' not in url and url[0].lower() == 'h' and '/////' not in url and len(format(url)[1]) < 6:
        if len(format(url)[2]) > 0:
            for i in format(url)[2]:
                if len(i) > 20:
                    return False
        if 'viewthread' in url or 'forumdisplay' in url:
            return False
        return True
    return False


def url_custom_control(url):
    '''
    Custom keyword control (filtering)
    '''
    for i in CUSTOM_KEY:
        if i in url:
            return False
    return True


def url_custom_focus_control(url, focuskey):
    '''
    Custom keyword control (focusing)
    True  - matches the focus policy
    False - does not
    '''
    if len(focuskey) == 0:
        return True
    for i in focuskey:
        if i in url:
            return True
    return False


def url_repeat_control(url):
    '''
    URL repetition control
    True  - the url is not a duplicate
    False - the url is a duplicate
    '''
    if url not in REPEAT_SET:
        REPEAT_SET.add(url)
        return True
    return False


def url_filter_similarity(url, keyword, ignore_keyword, focuskey):
    if url_format_control(url) and url_similar_control(url) \
            and url_domain_control(url, keyword) and url_domain_control_ignore(url, IGNORE_KEY_WORD) \
            and url_custom_control(url) and url_custom_focus_control(url, focuskey):
        return True
    else:
        return False


def url_filter_no_similarity(url, keyword, ignore_keyword, focuskey):
    if url_format_control(url) and url_repeat_control(url) \
            and url_domain_control(url, keyword) and url_domain_control_ignore(url, IGNORE_KEY_WORD) \
            and url_custom_control(url) and url_custom_focus_control(url, focuskey):
        return True
    else:
        return False


if __name__ == "__main__":
    print url_format_control("http://www.gznu.edu.cn")
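A quick demonstration of what the similarity check buys: two URLs that differ only in their parameter values collapse into the same tuple, so the second one is dropped as a duplicate. The URLs are made-up examples of mine.

# Both URLs share the netloc, the path-segment lengths and the parameter
# names, so format() returns the same tuple for each of them.
print format("http://0day5.com/post.php?id=1&page=2")   # ('0day5.com', (0, 8), ('id', 'page'))
print format("http://0day5.com/post.php?page=9&id=42")  # ('0day5.com', (0, 8), ('id', 'page'))

print url_similar_control("http://0day5.com/post.php?id=1&page=2")   # True, first time this shape is seen
print url_similar_control("http://0day5.com/post.php?page=9&id=42")  # False, a similar URL was already seen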