Fetching HTTP proxies with Python

Published: July 24, 2016 // Categories: work log, ops, dev notes, linux, windows, python // 7 Comments

This script scrapes proxy listings from http://www.ip181.com/, http://www.kuaidaili.com/, and http://www.66ip.com/, then checks each proxy's reliability by fetching v2ex.com and guokr.com through it.

# -*- coding: utf-8 -*-
"""
    Crawl HTTP proxies from public listing sites
"""
import re
import sys
import time
import Queue
import logging
import requests
import threading
from pyquery import PyQuery
try:  # older requests bundles urllib3; newer releases may not ship requests.packages
    import requests.packages.urllib3 as urllib3
except ImportError:
    import urllib3  # fall back to the standalone package (pip install urllib3)
urllib3.disable_warnings()


#logging.basicConfig(
#    level=logging.DEBUG,
#    format="[%(asctime)s] %(levelname)s: %(message)s")

class Worker(threading.Thread):  # worker thread that executes queued jobs
    def __init__(self, workQueue, resultQueue, **kwds):
        threading.Thread.__init__(self, **kwds)
        self.setDaemon(True)
        self.workQueue = workQueue
        self.resultQueue = resultQueue

    def run(self):
        while 1:
            try:
                func, args, kwds = self.workQueue.get(False)  # grab a job without blocking
                res = func(*args, **kwds)
                self.resultQueue.put(res)  # store the result
            except Queue.Empty:
                break


class WorkManager:  # creates and manages the thread pool
    def __init__(self, num_of_workers=10):
        self.workQueue = Queue.Queue()  # job queue
        self.resultQueue = Queue.Queue()  # result queue
        self.workers = []
        self._recruitThreads(num_of_workers)

    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue)  # create a worker thread
            self.workers.append(worker)  # add it to the pool

    def start(self):
        for w in self.workers:
            w.start()

    def wait_for_complete(self):
        while len(self.workers):
            worker = self.workers.pop()  # take a worker out of the pool
            worker.join()  # wait until it has drained the queue and exited
            if worker.isAlive() and not self.workQueue.empty():
                self.workers.append(worker)  # still busy: put it back into the pool
        #logging.info('All jobs were complete.')

    def add_job(self, func, *args, **kwds):
        self.workQueue.put((func, args, kwds))  # enqueue a job

    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)

def check_proxies(ip, port):
    """
    Check whether a proxy is alive by fetching
    v2ex.com and guokr.com through it
    """
    proxies={'http': 'http://'+str(ip)+':'+str(port)}
    try:
        r0 = requests.get('http://v2ex.com', proxies=proxies,timeout=30,verify=False)
        r1 = requests.get('http://www.guokr.com', proxies=proxies,timeout=30,verify=False)

        # besides HTTP 200 from both sites, require marker strings from the
        # genuine pages so proxies that tamper with responses are rejected
        if (r0.status_code == requests.codes.ok and r1.status_code == requests.codes.ok
                and "09043258" in r1.content and "15015613" in r0.content):
            print ip, port
            return True
        else:
            return False

    except Exception:
        #sys.stderr.write(str(ip)+"\t"+str(port)+"\terror\r\n")
        return False

def get_ip181_proxies():
    """
    Fetch HTTP proxies from http://www.ip181.com/
    """
    proxy_list = []
    try:
        # the listing page is GB2312-encoded
        html_page = requests.get('http://www.ip181.com/',timeout=60,verify=False,allow_redirects=False).content.decode('gb2312')
        jq = PyQuery(html_page)
        for tr in jq("tr"):
            element = [PyQuery(td).text() for td in PyQuery(tr)("td")]
            if 'HTTP' not in element[3]:  # keep only rows advertising HTTP
                continue

            # skip proxies whose listed response time exceeds 5 seconds
            result = re.search(r'\d+\.\d+', element[4], re.UNICODE)
            if result and float(result.group()) > 5:
                continue
            proxy_list.append((element[0], element[1]))  # (ip, port)
    except Exception as e:
        sys.stderr.write(str(e))

    return proxy_list

def get_kuaidaili_proxies():
    """
    Fetch HTTP proxies from http://www.kuaidaili.com/
    """
    proxy_list = []
    for m in ['inha', 'intr', 'outha', 'outtr']:  # the four free-list sections of the site
        try:
            html_page = requests.get('http://www.kuaidaili.com/free/'+m,timeout=60,verify=False,allow_redirects=False).content.decode('utf-8')
            patterns = re.findall(r'(?P<ip>(?:\d{1,3}\.){3}\d{1,3})</td>\n?\s*<td.*?>\s*(?P<port>\d{1,4})',html_page)
            for element in patterns:
                proxy_list.append((element[0], element[1]))  # (ip, port)
        except Exception as e:
            sys.stderr.write(str(e))

    for n in range(0, 11):  # paginated list at /proxylist/0/ through /proxylist/10/
        try:
            html_page = requests.get('http://www.kuaidaili.com/proxylist/'+str(n)+'/',timeout=60,verify=False,allow_redirects=False).content.decode('utf-8')
            patterns = re.findall(r'(?P<ip>(?:\d{1,3}\.){3}\d{1,3})</td>\n?\s*<td.*?>\s*(?P<port>\d{1,4})',html_page)
            for element in patterns:
                proxy_list.append((element[0], element[1]))  # (ip, port)
        except Exception as e:
            sys.stderr.write(str(e))

    return proxy_list

def get_66ip_proxies():
    """
    Fetch HTTP proxies from the http://www.66ip.cn/ API
    (plus two plain-text lists from proxylists.net)
    """
    urllists = [
        'http://www.proxylists.net/http_highanon.txt',
        'http://www.proxylists.net/http.txt',
        'http://www.66ip.cn/nmtq.php?getnum=1000&anonymoustype=%s&proxytype=2&api=66ip',
        'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1'
        ]
    proxy_list = []
    for url in urllists:
        try:
            html_page = requests.get(url,timeout=60,verify=False,allow_redirects=False).content.decode('gb2312')
            # full dotted-quad IP followed by a port
            patterns = re.findall(r'((?:\d{1,3}\.){3}\d{1,3}):([1-9]\d*)',html_page)
            for element in patterns:
                proxy_list.append((element[0], element[1]))  # (ip, port)
        except Exception as e:
            sys.stderr.write(str(e))

    return proxy_list


def get_proxy_sites():
    wm = WorkManager(20)
    proxysites = []
    proxysites.extend(get_ip181_proxies())
    proxysites.extend(get_kuaidaili_proxies())
    proxysites.extend(get_66ip_proxies())

    for element in proxysites:
        wm.add_job(check_proxies,str(element[0]),str(element[1]))
    wm.start()
    wm.wait_for_complete()


if __name__ == '__main__':
    try:
        get_proxy_sites()
    except Exception as exc:
        print(exc)
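
A live proxy printed by the script can be dropped straight into requests. A minimal usage sketch (the address 1.2.3.4:8080 is a placeholder, not real output):

# -*- coding: utf-8 -*-
import requests

proxies = {'http': 'http://1.2.3.4:8080'}  # hypothetical proxy taken from the script's output
r = requests.get('http://v2ex.com', proxies=proxies, timeout=30)
print r.status_code  # expect 200 if the proxy relays the request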

Tags: HTTP proxy, proxies

7 comments on "Fetching HTTP proxies with Python".

  1. 你好

    Which Python version are you using? The test doesn't pass for me. Thanks.

    1. Python 2.7.10 by default. Paste your error message and let's take a look.

      1. 你好

        No module named packages.urllib3
        I tried pip install urllib3, but it's already installed and I still get the error. Is it an import problem?
        Also, isn't print(exc) at the end Python 3 syntax?

        1. That's the copy bundled inside the requests package, isn't it~ ^ - ^

  2. reber

    Bro, why does WorkManager define a separate wait_for_complete method instead of just calling queue.join() in start() to wait for the queue to finish?
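
    1. queue.join() would work too. A minimal sketch of that variant, assuming each worker calls task_done() so join() knows when the queue is drained:

      def run(self):
          while 1:
              try:
                  func, args, kwds = self.workQueue.get(False)
                  self.resultQueue.put(func(*args, **kwds))
                  self.workQueue.task_done()  # tell join() this job is finished
              except Queue.Empty:
                  break

      # and in WorkManager, wait_for_complete() could be replaced with:
      #     self.workQueue.join()  # blocks until every queued job is task_done()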
