Lately I have been tinkering with some tedious stuff: a crawler plus scanning, with the scanning handed off to sqlmapapi. There is not much material on this yet, but some can still be found, for example 《使用sqlmapapi.py批量化扫描实践》 (batch scanning practice with sqlmapapi.py): http://drops.wooyun.org/tips/6653
Let's look at the AutoSqli class it wraps around sqlmapapi:
#!/usr/bin/python
#-*-coding:utf-8-*-
import requests
import time
import json


class AutoSqli(object):
    """
    Talk to a running sqlmapapi server through its REST interface.
    By Manning
    """

    def __init__(self, server='', target='', data='', referer='', cookie=''):
        super(AutoSqli, self).__init__()
        self.server = server
        if self.server[-1] != '/':
            self.server = self.server + '/'
        self.target = target
        self.taskid = ''
        self.engineid = ''
        self.status = ''
        self.data = data
        self.referer = referer
        self.cookie = cookie
        self.start_time = time.time()

    # Create a new scan task
    def task_new(self):
        self.taskid = json.loads(
            requests.get(self.server + 'task/new').text)['taskid']
        print 'Created new task: ' + self.taskid
        # Every later call is made against this taskid
        if len(self.taskid) > 0:
            return True
        return False

    # Delete a scan task
    def task_delete(self):
        if json.loads(requests.get(self.server + 'task/' + self.taskid + '/delete').text)['success']:
            print '[%s] Deleted task' % (self.taskid)
            return True
        return False

    # Start the scan task
    def scan_start(self):
        headers = {'Content-Type': 'application/json'}
        # The URL to be scanned
        payload = {'url': self.target}
        url = self.server + 'scan/' + self.taskid + '/start'
        # e.g. http://127.0.0.1:8557/scan/xxxxxxxxxx/start
        t = json.loads(
            requests.post(url, data=json.dumps(payload), headers=headers).text)
        self.engineid = t['engineid']
        if len(str(self.engineid)) > 0 and t['success']:
            print 'Started scan'
            return True
        return False

    # Status of the scan task
    def scan_status(self):
        self.status = json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/status').text)['status']
        if self.status == 'running':
            return 'running'
        elif self.status == 'terminated':
            return 'terminated'
        else:
            return 'error'

    # Findings of the scan task
    def scan_data(self):
        self.data = json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/data').text)['data']
        if len(self.data) == 0:
            print 'not injection:\t' + self.target
        else:
            print 'injection:\t' + self.target

    # Scan options, mainly the request parameters
    def option_set(self):
        headers = {'Content-Type': 'application/json'}
        option = {"options": {
            "smart": True,
            # ... (remaining options elided in the original)
        }}
        url = self.server + 'option/' + self.taskid + '/set'
        t = json.loads(
            requests.post(url, data=json.dumps(option), headers=headers).text)
        print t

    # Stop the scan task
    def scan_stop(self):
        json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/stop').text)['success']

    # Kill the scan task process
    def scan_kill(self):
        json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/kill').text)['success']

    def run(self):
        if not self.task_new():
            return False
        self.option_set()
        if not self.scan_start():
            return False
        while True:
            if self.scan_status() == 'running':
                time.sleep(10)
            elif self.scan_status() == 'terminated':
                break
            else:
                break
            print time.time() - self.start_time
            if time.time() - self.start_time > 3000:
                error = True
                self.scan_stop()
                self.scan_kill()
                break
        self.scan_data()
        self.task_delete()
        print time.time() - self.start_time


if __name__ == '__main__':
    t = AutoSqli('http://127.0.0.1:8774', 'http://192.168.3.171/1.php?id=1')
    t.run()
Its workflow is as follows (a minimal sketch of these raw calls is shown right after the list):
GET request to create a task and obtain its taskid
POST request against that taskid to set the scan options
POST request against that taskid to start scanning the given url
GET request against that taskid to query the scan status
GET request against that taskid to fetch the test results
GET request against that taskid to delete the task
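As an illustration (my own sketch, not code from the original article), the same round trip can be made with plain requests calls against those endpoints; it assumes a sqlmapapi server already listening on 127.0.0.1:8775 and uses the same placeholder target as above:
# Minimal sketch of the raw sqlmapapi round trip (server address and target are placeholders).
import json
import time
import requests

server = 'http://127.0.0.1:8775/'
target = 'http://192.168.3.171/1.php?id=1'

# 1. create a task and remember its taskid
taskid = requests.get(server + 'task/new').json()['taskid']

# 2. set options for this task (POST with a JSON body)
requests.post(server + 'option/' + taskid + '/set',
              data=json.dumps({'options': {'smart': True}}),
              headers={'Content-Type': 'application/json'})

# 3. start the scan against the target url
requests.post(server + 'scan/' + taskid + '/start',
              data=json.dumps({'url': target}),
              headers={'Content-Type': 'application/json'})

# 4. poll until sqlmap reports the scan as 'terminated'
while requests.get(server + 'scan/' + taskid + '/status').json()['status'] == 'running':
    time.sleep(5)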
Moving into the server class in lib/utils/api.py, you can see that all interaction with the service happens by submitting requests to the server, and the handlers fall into three groups:
Users' methods
Admin functions
sqlmap core interact functions
The routes you can submit data to are listed below.
Users' methods
@get("/task/new")
@get("/task/<taskid>/delete")
Admin functions
@get("/admin/<taskid>/list")
@get("/admin/<taskid>/flush")
sqlmap core interact functions
@get("/option/<taskid>/list")
@post("/option/<taskid>/get")
@post("/option/<taskid>/set")
@post("/scan/<taskid>/start")
@get("/scan/<taskid>/stop")
@get("/scan/<taskid>/kill")
@get("/scan/<taskid>/status")
@get("/scan/<taskid>/data")
@get("/scan/<taskid>/log/<start>/<end>")
@get("/scan/<taskid>/log")
@get("/download/<taskid>/<target>/<filename:path>")
Finally, as for deciding whether a target has an injection vulnerability, the code judges it like this: if the data field of the returned dictionary is non-empty, the target is injectable.
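A tiny sketch of that check (my own example, with a placeholder taskid; the server address matches the examples in this post):
# Minimal sketch of the injection check on /scan/<taskid>/data (placeholder values).
import requests

server = 'http://127.0.0.1:8775/'                 # assumed sqlmapapi address
taskid = '...'                                    # taskid returned earlier by task/new
target = 'http://192.168.3.171/1.php?id=1'

data = requests.get(server + 'scan/' + taskid + '/data').json()['data']
if len(data) > 0:
    print 'injection:\t' + target
else:
    print 'not injection:\t' + target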
Then I took the crawler module from https://github.com/smarttang/w3a_Scan_Console/blob/master/module/sprider_module.py and integrated it with a few tweaks:
#!/usr/bin/python
# vim: set fileencoding=utf-8:
import sys
import urllib2
import re
from BeautifulSoup import BeautifulSoup

import autosql


class SpriderUrl:
    # Initialisation
    def __init__(self, url):
        self.url = url
        #self.con=Db_Connector('sprider.ini')

    # Get the first batch of URLs from the target page
    def get_self(self):
        urls = []
        try:
            body_text = urllib2.urlopen(self.url).read()
        except:
            print "[*] Web Get Error:checking the Url"
            return
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # We have the raw href, but it still needs processing
            _url = link.get('href')
            # First weed out None values and meaningless prefixes,
            # then skip file extensions that are not worth crawling
            if _url is None or re.match('^(javascript|:;|#)', _url) \
                    or re.search('\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                continue
            # For absolute http/https links, only keep those that stay on this site;
            # the crawler never leaves the target domain
            if re.match('^(http|https)', _url):
                if not re.match('^' + self.url, _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        rst = list(set(urls))
        for rurl in rst:
            try:
                self.sprider_self_all(rurl)
                # Recurse over every page. The obvious drawback is that the same pages
                # get re-crawled over and over; each URL is then handed to autosql
                # AutoSqli('http://127.0.0.1:8775', rurl).run()
            except:
                print "spider error"

    def sprider_self_all(self, domain):
        urls = []
        try:
            body_text = urllib2.urlopen(domain).read()
        except:
            print "[*] Web Get Error:checking the Url"
            sys.exit(0)
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # We have the raw href, but it still needs processing
            _url = link.get('href')
            # Same filtering as above: drop None values, meaningless prefixes
            # and file extensions that are not worth crawling
            try:
                if _url is None or re.match('^(javascript|:;|#)', str(_url)) \
                        or re.search('\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', str(_url)):
                    continue
            except TypeError:
                print "[*] Type is Error! :" + str(_url)
                continue
            # Again, only follow http/https links that stay on this site
            if re.match('^(http|https)', _url):
                if not re.match('^' + self.url, _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        res = list(set(urls))
        for rurl in res:
            try:
                print rurl
                #AutoSqli('http://127.0.0.1:8775', rurl).run()
            except:
                print "spider error"


spi = "http://0day5.com/"
t = SpriderUrl(spi)
# First pass over the start page
t.get_self()
The best approach is still to store the URLs in a database and then check there for duplicates (a self-contained sketch of that idea follows the snippet):
for rurl in res:
    if self.con.find_item("select * from url_sprider where url='" + rurl + "' and domain='" + self.url + "'"):
        continue
    else:
        try:
            self.con.insert_item("insert into url_sprider(url,tag,domain)values('" + rurl + "',0,'" + self.url + "')")
        except:
            print "[*] insert into is Error!"
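The Db_Connector used above comes from the w3a_Scan_Console project and is not shown here. As a self-contained stand-in, a minimal sketch of the same dedup idea with the standard sqlite3 module (using parameterised queries instead of string concatenation) might look like this; the table and column names are only illustrative:
# Minimal, self-contained sketch of the URL dedup idea using sqlite3.
# Table/column names are illustrative, not taken from the original project.
import sqlite3

con = sqlite3.connect('sprider.db')
con.execute("create table if not exists url_sprider(url text, tag integer, domain text)")

def seen_or_record(con, url, domain):
    """Return True if the url was already stored for this domain, otherwise store it."""
    row = con.execute("select 1 from url_sprider where url=? and domain=?",
                      (url, domain)).fetchone()
    if row is not None:
        return True
    con.execute("insert into url_sprider(url,tag,domain) values(?,0,?)", (url, domain))
    con.commit()
    return False

# Usage inside the crawler loop: skip anything already recorded.
# for rurl in res:
#     if seen_or_record(con, rurl, self.url):
#         continue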
I am still pulling together notes on the crawler side:
1. Many crawlers have an obvious fingerprint; the fix is to set a matching User-Agent.
2. Some WAFs can be bypassed through the Referer, e.g. by making the request look like it came from Baidu.
So I improved the headers a bit (a usage sketch follows these lists):
import random

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

REFERERS = [
    "https://www.baidu.com",
    "http://www.baidu.com",
    "https://www.google.com.hk",
    "http://www.so.com",
    "http://www.sogou.com",
    "http://www.soso.com",
    "http://www.bing.com",
]

default_cookies = {}

# Pick a random User-Agent and Referer for each session
default_headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': random.choice(REFERERS),
    'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
}
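As a usage sketch (my own example, not from the original post), these headers can then be attached to every crawler request; with urllib2, which the crawler above already uses, that could look like:
# Minimal sketch: fetch a page with the randomised headers defined above.
# Assumes USER_AGENTS / REFERERS / default_headers from the previous block.
import urllib2

def fetch(url):
    req = urllib2.Request(url, headers=default_headers)
    return urllib2.urlopen(req, timeout=10).read()

# body_text = fetch("http://0day5.com/")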
Then there is still the filtering problem, namely similarity checking, so that the crawl stays effective and the results come out a bit more precise. The algorithm relies on decomposing the URL and hashing the decomposition, which fits this kind of requirement well. A URL is broken into three dimensions: the first is the netloc, the second is the length of each path segment, and the third is the sorted list of query parameter names. A data structure combines these three dimensions into a single hashable object. [From: http://drops.wooyun.org/tips/5462] (A small usage example follows the module.)
#!/usr/bin/env python
# coding:utf-8
import time
import os
import urlparse
import hashlib
import sys
sys.path.append("..")
from config.config import *
reload(sys)
sys.setdefaultencoding("utf-8")

SIMILAR_SET = set()
REPEAT_SET = set()

'''
2015.3.30
Keep the two crawler notions apart: focusing vs. filtering.
Focusing: return True if the keyword is in the url, otherwise False.
Filtering: return False if the keyword is in the url, otherwise True.
'''


def format(url):
    '''
    The strategy is to build a triple:
    the first item is the url's netloc;
    the second item is the length of each segment of the path;
    the third item is the name of every query parameter (sorted alphabetically,
    so a different parameter order does not create a duplicate).
    '''
    if urlparse.urlparse(url)[2] == '':
        url = url + '/'
    url_structure = urlparse.urlparse(url)
    netloc = url_structure[1]
    path = url_structure[2]
    query = url_structure[4]
    temp = (netloc,
            tuple([len(i) for i in path.split('/')]),
            tuple(sorted([i.split('=')[0] for i in query.split('&')])))
    #print temp
    return temp


def check_netloc_is_ip(netloc):
    '''
    Return True if the url's netloc is in IP form, otherwise False.
    '''
    flag = 0
    t = netloc.split('.')
    for i in t:
        try:
            int(i)
            flag += 1
        except Exception, e:
            break
    if flag == 4:
        return True
    return False


def url_domain_control(url, keyword):
    '''
    URL domain control (focusing).
    True: the url passes the domain check.
    False: the url does not pass the domain check.
    1. keyword may be a list or a str.
    2. if the url's netloc is in IP form, return True.
    '''
    t = format(url)
    if check_netloc_is_ip(t[0]):
        return True
    elif str(type(keyword)) == "<type 'list'>":
        for i in keyword:
            if i.lower() in t[0].lower():
                return True
    elif str(type(keyword)) == "<type 'str'>":
        if keyword.lower() in t[0].lower():
            return True
    return False


def url_domain_control_ignore(url, keyword):
    '''
    URL domain control (filtering).
    True: none of the ignore keywords appear in the url.
    False: an ignore keyword appears in the url.
    For example, when ignoring "blog": if the netloc contains "blog", return False.
    '''
    t = format(url)
    for i in keyword:
        if i in t[0].lower():
            return False
    return True


def url_similar_control(url):
    '''
    URL similarity control.
    True: the url has not been seen before.
    False: the url is a (similar) duplicate.
    '''
    t = format(url)
    if t not in SIMILAR_SET:
        SIMILAR_SET.add(t)
        return True
    return False


def url_format_control(url):
    '''
    URL format control (filtering).
    True: the url passes the format check.
    False: the url does not pass the format check.
    '''
    if '}' not in url and '404' not in url and url[0].lower() == 'h' and '/////' not in url and len(format(url)[1]) < 6:
        if len(format(url)[2]) > 0:
            for i in format(url)[2]:
                if len(i) > 20:
                    return False
        if 'viewthread' in url or 'forumdisplay' in url:
            return False
        return True
    return False


def url_custom_control(url):
    '''
    Custom keyword control (filtering).
    True: no custom blacklist keyword appears in the url.
    False: a blacklist keyword appears in the url.
    '''
    for i in CUSTOM_KEY:
        if i in url:
            return False
    return True


def url_custom_focus_control(url, focuskey):
    '''
    Custom keyword control (focusing).
    True: the url matches the focus policy.
    False: it does not.
    '''
    if len(focuskey) == 0:
        return True
    for i in focuskey:
        if i in url:
            return True
    return False


def url_repeat_control(url):
    '''
    URL repetition control.
    True: the url has not been seen before.
    False: the url is an exact duplicate.
    '''
    if url not in REPEAT_SET:
        REPEAT_SET.add(url)
        return True
    return False


def url_filter_similarity(url, keyword, ignore_keyword, focuskey):
    if url_format_control(url) and url_similar_control(url) \
            and url_domain_control(url, keyword) and url_domain_control_ignore(url, IGNORE_KEY_WORD) \
            and url_custom_control(url) and url_custom_focus_control(url, focuskey):
        return True
    else:
        return False


def url_filter_no_similarity(url, keyword, ignore_keyword, focuskey):
    if url_format_control(url) and url_repeat_control(url) \
            and url_domain_control(url, keyword) and url_domain_control_ignore(url, IGNORE_KEY_WORD) \
            and url_custom_control(url) and url_custom_focus_control(url, focuskey):
        return True
    else:
        return False


if __name__ == "__main__":
    print url_format_control("http://www.gznu.edu.cn")
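To make the three-dimensional decomposition concrete, here is a small usage sketch run inside the module above (the example URLs are made up): two URLs that differ only in parameter values and parameter order collapse to the same triple, so the second one is treated as a similar duplicate.
# Usage sketch for format() and url_similar_control() (example URLs are made up).
print format("http://example.com/news/view.php?id=1&page=2")
# -> ('example.com', (0, 4, 8), ('id', 'page'))
print format("http://example.com/news/view.php?page=9&id=42")
# -> the same triple: only path segment lengths and sorted parameter names matter

print url_similar_control("http://example.com/news/view.php?id=1&page=2")   # True, first time seen
print url_similar_control("http://example.com/news/view.php?page=9&id=42")  # False, similar duplicate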