
Description

The project lives in a GitHub repository; please give it a star!

  • Pulls a proxy list from GitHub, checks liveness locally, and saves the results as a txt file
  • Uses multithreading to speed things up
  • Selectable proxy type
  • Automatic deduplication
  • Automatically tries a mirror URL when the original address fails
  • Validation method: requests.get(url="http://icanhazip.com/", proxies=Proxy)
  • Validation method: double validation (telnet + requests) picks out high-quality proxies (see the sketch below)
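
A minimal sketch of the double-validation idea (the helper name double_check is illustrative, not part of the project; the full logic lives in checkProxy() in the source code below):

import telnetlib
import requests

def double_check(ip, port, proxy_type="http", timeout=3):
    proxy = "%s://%s:%s" % (proxy_type, ip, port)
    try:
        # Stage 1: bare TCP connectivity check via telnet
        telnetlib.Telnet(ip, port=port, timeout=timeout)
        # Stage 2: a real request through the proxy; icanhazip.com echoes
        # back the IP it sees, which should match the proxy's own IP
        rep = requests.get("http://icanhazip.com/", timeout=timeout,
                           proxies={proxy_type: proxy})
        return rep.text.strip() == ip
    except Exception:
        return False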

Adding a source

Note

Requirements for the source document:

  1. One entry per line; invalid lines are allowed
  2. The file needs to be UTF-8 encoded for the generic extraction function to work (probably)
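
For example, a plain ip:port list such as the TheSpeedX file satisfies both rules, and a source's matching rules can be smoke-tested against a few sample lines before adding it (the sample text here is illustrative):

import re

sample = "62.171.170.82:3128\nsome invalid line\n178.212.54.137:8080\n"
findIp = re.compile(r'(\d+?\.\d+?\.\d+?\.\d+?):')
findPort = re.compile(r':(\d+?)\n')
print(findIp.findall(sample))    # ['62.171.170.82', '178.212.54.137']
print(findPort.findall(sample))  # ['3128', '8080']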

Template

Add this before the function definitions in the code.

# Format template for a proxy source:
# document url + type + ip-matching rule + port-matching rule + type-matching rule
# Use "mix" when the type is not fixed
# telnet is the default check method; modify checkProxy() if you need something else

# raw2_http
# 62.171.170.82:3128
raw = {"url": 'https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt',
        "type": "http",
        "findIp": re.compile(r'(\d+?\.\d+?\.\d+?\.\d+?):'),
        "findPort": re.compile(r':(\d+?)\n'),
        "findType": ""}
raws.append(raw)
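
When a source document mixes protocols, "type" is set to "mix" and an additional findType rule extracts the protocol per line, as in the fate0 template from the source code below:

# raw1_mix
raw = {"url": 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list',
       "type": "mix",
       "findIp": re.compile(r'"host": "(.+?)"'),
       "findPort": re.compile(r'"port": (\d+?),'),
       "findType": re.compile(r'"type": "(.+?)"')}
raws.append(raw)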

Disclaimer

This project was born out of interest and is open-sourced for testing purposes only.

Current source code

# -*- coding: utf-8 -*-
# @Time : 2021/1/26 22:25
# @Author : 96bearli
# @File : main.py
# @Software : PyCharm

'''
Idea              Pull a proxy list from GitHub, check liveness locally, save the results as a txt file
Documents contain http/https, ip:port, ping
Validation        requests.get(url="http://icanhazip.com/", proxies=Proxy) > struck out but still optional; reduce concurrency to single digits
                  telnet validation by default
Multithreading is used to speed things up
type is selectable; change it in checkProxy() at the line `if "http" in type:`
Upcoming feature: deduplication > added
Change: double validation is now used to obtain high-quality, stable proxies (e.g. 2500 filtered down to 100)
'''
import telnetlib
import requests
import re
import queue
import threading
from time import sleep

# Format template for a proxy source
# Document requirements: 1. one entry per line; invalid lines are allowed
#                        2. the file needs to be UTF-8 encoded for the generic extraction function to work (probably)
# document url + type + ip-matching rule + port-matching rule + type-matching rule
# Use "mix" when the type is not fixed; note that the default telnet check does not support socks4/socks5
# Modify checkProxy() if you need something else
# raw = {"url": 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list',
#        "type": "mix",
#        "findIp": re.compile(r'"host": "(.+?)"'),
#        "findPort": re.compile(r'"port": (\d+?),'),
#        "findType": re.compile(r'"type": "(.+?)"')}
# raws.append(raw)
raws = []
# raw1_mix
# {"host": "3.211.65.185", "from": "proxylist", "country": "US", "port": 80, "response_time": 0.82, "export_address": ["35.193.184.18", "192.168.99.102", "54.226.33.176"], "type": "http", "anonymity": "high_anonymous"}
raw = {"url": 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list',
       "type": "mix",
       "findIp": re.compile(r'"host": "(.+?)"'),
       "findPort": re.compile(r'"port": (\d+?),'),
       "findType": re.compile(r'"type": "(.+?)"')}
raws.append(raw)
# raw2_http
# 62.171.170.82:3128
raw = {"url": 'https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt',
       "type": "http",
       "findIp": re.compile(r'(\d+?\.\d+?\.\d+?\.\d+?):'),
       "findPort": re.compile(r':(\d+?)\n'),
       "findType": ""}
raws.append(raw)
# raw3_http
# 62.171.170.82:3128
raw = {"url": 'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt',
       "type": "http",
       "findIp": re.compile(r'(\d+?\.\d+?\.\d+?\.\d+?):'),
       "findPort": re.compile(r':(\d+?)\n'),
       "findType": ""}
raws.append(raw)
# raw4_http
# 178.212.54.137:8080
raw = {"url": 'https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt',
       "type": "http",
       "findIp": re.compile(r'(\d+?\.\d+?\.\d+?\.\d+?):'),
       "findPort": re.compile(r':(\d+?)\n'),
       "findType": ""}
raws.append(raw)
# raw5_https
# 103.21.161.105:6667
raw = {"url": 'https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt',
       "type": "https",
       "findIp": re.compile(r'(\d+?\.\d+?\.\d+?\.\d+?):'),
       "findPort": re.compile(r':(\d+?)\r\n'),
       "findType": ""}
raws.append(raw)
# raw6_HTTP/HTTPS
# 203.202.245.58:80
raw = {"url": 'https://sunny9577.github.io/proxy-scraper/proxies.txt',
       "type": "https",
       "findIp": re.compile(r'(\d+?\.\d+?\.\d+?\.\d+?):'),
       "findPort": re.compile(r':(\d+?)\n'),
       "findType": ""}
raws.append(raw)

# Mirrors; replace the links as needed
# Mainland China CDN
# proxyListUrl = 'https://raw.sevencdn.com/fate0/proxylist/master/proxy.list'
# proxyListUrl = 'https://cdn.jsdelivr.net/gh/fate0/proxylist@master/proxy.list'
# Hong Kong CDN
# proxyListUrl = 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list'
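# getProxyList() below also performs this swap automatically when the
# original URL fails:
# raw["url"].replace("raw.githubusercontent.com", "raw.sevencdn.com")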

def checkProxy():
    # Keep pulling entries until the queue is drained
    while True:
        # get_nowait() avoids the race where another thread empties the
        # queue between an empty() check and a blocking get()
        try:
            proxies = checkDataQueue.get_nowait()
        except queue.Empty:
            break
        ip = proxies[0]
        port = proxies[1]
        type = proxies[2]
        Proxy = type + '://' + ip + ':' + port
        print(Proxy)
        try:
            # Double validation combining telnet and requests
            telnetlib.Telnet(ip, port=port, timeout=3)
            # Choose the type here: `in` keeps http/https; use `not in` or another test
            # if "http" in type:
            if type == "http":
                rep = requests.get(url="http://icanhazip.com/", timeout=3, proxies={type: Proxy})
                repIp = str(rep.text.replace("\n", ""))
                if repIp in Proxy:
                    print("Double validation found a higher-quality proxy: %s" % Proxy)
                    # Acquire the lock; this blocks until the lock is obtained
                    mutex_lock.acquire()
                    with open('./proxy.txt', 'a+', encoding='utf-8') as file:
                        file.write(Proxy + '\n')
                    mutex_lock.release()

            # Pure requests method
            # rep = requests.get(url="http://icanhazip.com/", timeout=3, proxies={type: Proxy})
            # repIp = rep.text.replace("\n", "")
            # if repIp == ip:
            #     if type == "http":
            #         print("Found a working proxy %s" % Proxy)
            #         # Acquire the lock; this blocks until the lock is obtained
            #         mutex_lock.acquire()
            #         with open('./proxy.txt', 'a+', encoding='utf-8') as f:
            #             f.write(Proxy + "\n")
            #         mutex_lock.release()
            #     else:
            #         print("Invalid")

            # Pure telnet method
            # telnetlib.Telnet(ip, port=port, timeout=3)
            # # Choose the type here: `in` keeps http/https, `not in` everything else;
            # # switch the check to requests yourself as needed
            # # if "http" in type:
            # if type == "http":
            #     print("Found a working proxy %s" % Proxy)
            #     # Acquire the lock; this blocks until the lock is obtained
            #     mutex_lock.acquire()
            #     with open('./proxy.txt', 'a+', encoding='utf-8') as f:
            #         f.write(Proxy + "\n")
            #     mutex_lock.release()
            # else:
            #     print("Invalid")

        except Exception as proxyError:
            print(proxyError)
            print("无效")

def reWriteList(aList):
    # A straight list(set(aList)) raises "unhashable type: 'list'";
    # a 2-D list has to be converted to tuples first
    aList = list(set([tuple(t) for t in aList]))
    return aList
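
# Example of the failure mode and the fix:
#   list(set([['1.2.3.4', '80']]))                     -> TypeError: unhashable type: 'list'
#   list(set([('1.2.3.4', '80'), ('1.2.3.4', '80')]))  -> [('1.2.3.4', '80')]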

def getProxyList():
    allList = []
    for raw in raws:
        print("-" * 20)
        print("正在尝试获取源raw%i提供的列表" % (raws.index(raw) + 1))
        try:
            response = requests.get(raw["url"], timeout=3)
            print(response.content)
        except Exception as rawError:
            print(rawError)
            try:  # Try switching to a mirror automatically
                response = requests.get(raw["url"].replace("raw.githubusercontent.com", "raw.sevencdn.com"), timeout=3)
                print(response.content)
                print("raw%i原始地址尝试失败,自动更换镜像地址成功" % (raws.index(raw) + 1))
            except Exception as rawError2:
                print(rawError2)
                input("raw%i_url已失效,请检查,可尝试自行更换,按回车继续" % (raws.index(raw) + 1))
                continue
        # Two-way splitting for safety
        try:
            print("Trying to split with .splitlines()")
            # Keep each line's trailing newline: the findPort rules anchor on '\n' / '\r\n'
            theList = response.text.splitlines(keepends=True)
        except Exception:
            # Extract individual lines with a regex instead
            print("Trying regex splitting")
            findLine = re.compile(r".+?\n")
            theList = re.findall(findLine, response.text)
        # Extract the fields
        if raw["type"] != "mix":
            for line in theList:
                line = str(line)
                try:
                    ip = re.findall(raw["findIp"], line)[0]
                    port = re.findall(raw["findPort"], line)[0]
                    type = raw["type"]
                    aList = [ip, port, type]
                    allList.append(aList)
                except Exception as reError:
                    print(reError)
                    continue
        else:
            for line in theList:
                line = str(line)
                try:
                    ip = re.findall(raw["findIp"], line)[0]
                    port = re.findall(raw["findPort"], line)[0]
                    type = re.findall(raw["findType"], line)[0]
                    aList = [ip, port, type]
                    allList.append(aList)
                except Exception as reError:
                    print(reError)
                    continue
    # Deduplicate + count
    before = len(allList)
    if before == 0:
        print("Better double-check the source configuration")
        exit()
    allList = reWriteList(allList)
    after = len(allList)
    print("-" * 20)
    print("Fetched %i proxy entries this run; %i left after deduplication" % (before, after))
    print("Multithreaded liveness checking starts automatically in 3 seconds")
    print("-" * 20)
    sleep(3)
    # Test point
    # with open("./cache", "w") as f:
    #     for all in allList:
    #         for a in all:
    #             f.write(a)
    #         f.write("\n")
    return allList

if __name__ == '__main__':
    # First get the list of proxies to process
    proxiesList = getProxyList()
    # Create an empty ./proxy.txt
    with open('./proxy.txt', 'w', encoding='utf-8') as f:
        pass
    # Put all the proxy entries into a first-in-first-out (FIFO) queue
    # Queue reads and writes are blocking, so multithreaded access stays orderly
    # Multithreading is introduced (without any framework) to improve efficiency
    # Create the queue
    checkDataQueue = queue.Queue(len(proxiesList))
    # 写入代理数据到队列
    for proxy in proxiesList:
        # each element of proxiesList is itself a list; enqueue them one by one
        checkDataQueue.put(proxy)
    proxiesList = []
    # Create a lock so concurrent file writes don't get garbled
    mutex_lock = threading.Lock()
    # 500 threads; within limits, more threads means faster checking.
    # Careful, though: don't turn this into a DDoS and knock the validation server over
    for i in range(500):
        t = threading.Thread(target=checkProxy, name='LoopThread' + str(i))
        t.start()
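    # Note: the workers are never join()ed; the process exits once every
    # thread has drained the queue. To block until all checks finish:
    # threads = [threading.Thread(target=checkProxy) for _ in range(500)]
    # for t in threads:
    #     t.start()
    # for t in threads:
    #     t.join()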
Last modified: March 8, 2021
