下面給出一套可直接落地的 Python 自動化“代理程序可用性”檢測方案,覆蓋 HTTP/HTTPS/SOCKS5、併發掃描、超時與重試、多維指標採集,用於批量篩選穩定代理與持續巡檢。所有關鍵點都以 <span style="color:red">紅色</span> 標註。🙂
一、判定口徑(先給結論)
- 以 “成功建連 + 請求返回 2xx/3xx + 端到端耗時低於閾值” 作為 <span style="color:red">可用性</span> 判定標準。
- 對每個代理記錄 connect\_time / tls\_time / ttfb / total / http\_code / 異常類型 並輸出 CSV,便於迴歸對比與報表。
- 採用 異步併發 優先(`asyncio` + `aiohttp`/`aiohttp_socks`),兼顧 最小可用 的同步腳本,確保在受限環境也能執行。
二、最小可用腳本(HTTP/HTTPS,同步版)
# file: proxy_check_min.py
import os, time, socket, requests
# Business URL fetched through each proxy; replace the placeholder in production.
TARGET_URL = os.environ.get("TARGET_URL", "http://example.com/")
# Timeout threshold in seconds.
TIMEOUT = float(os.environ.get("TIMEOUT", "5"))
def check_http_proxy(proxy: str):
    """Probe one HTTP(S) proxy and report whether TARGET_URL is reachable through it.

    The check runs in two stages: a raw TCP connect to the proxy endpoint
    (so unreachable proxies fail fast), then a GET of TARGET_URL routed
    through the proxy.

    Args:
        proxy: Proxy URL such as ``http://user:pass@host:port`` or
            ``http://host:port``; a bare ``host:port`` is accepted as well.
            The same proxy is used for both http and https targets.

    Returns:
        A dict with the keys ``proxy``, ``ok`` (True when the status code is
        2xx/3xx), ``http_code`` (-1 on failure), ``connect_time`` (seconds
        from start until the TCP probe finished, None if the probe did not
        complete), ``total`` (seconds from start to finish) and ``error``
        (exception class name, empty string on success).
    """
    from urllib.parse import urlsplit

    proxies = {"http": proxy, "https": proxy}
    connect_time = None
    t0 = time.perf_counter()
    try:
        # urlsplit handles credentials, bracketed IPv6 literals and trailing
        # paths, all of which break naive "@" / "//" / ":" splitting.
        parts = urlsplit(proxy if "://" in proxy else "//" + proxy)
        host = parts.hostname
        if not host:
            raise ValueError("proxy URL has no host")
        # Fall back to the scheme's conventional port when none is given.
        port = parts.port or (443 if parts.scheme == "https" else 80)

        # TCP reachability probe first, to discover dead proxies quickly.
        with socket.create_connection((host, port), timeout=TIMEOUT):
            pass
        connect_time = round(time.perf_counter() - t0, 3)

        # Actual request routed through the proxy.
        resp = requests.get(TARGET_URL, proxies=proxies, timeout=TIMEOUT, allow_redirects=True)
        code = resp.status_code
        total = round(time.perf_counter() - t0, 3)
        return {
            "proxy": proxy,
            "ok": 200 <= code < 400,
            "http_code": code,
            "connect_time": connect_time,
            "total": total,
            "error": "",
        }
    except Exception as e:
        # Deliberately broad: every failure mode is reduced to its class name
        # so the caller can aggregate error statistics instead of crashing.
        return {
            "proxy": proxy,
            "ok": False,
            "http_code": -1,
            # Keep the probe timing when only the HTTP stage failed.
            "connect_time": connect_time,
            "total": round(time.perf_counter() - t0, 3),
            "error": type(e).__name__,
        }
if __name__ == "__main__":
    # Example: proxies are injected as a comma-separated PROXIES environment variable.
    proxy_list = os.getenv("PROXIES", "http://127.0.0.1:8080,http://127.0.0.1:8000")
    for entry in proxy_list.split(","):
        candidate = entry.strip()
        if candidate:
            print(check_http_proxy(candidate))
逐段解釋:
- `TARGET_URL`:業務可用性應以“真實目標”判定,腳本僅給默認佔位;生產請替換為你的探活地址。
- `socket.create_connection`:先做 TCP 可達性探測,能快速淘汰網絡不可達代理,提高整體吞吐。
- `requests.get(..., proxies=...)`:通過代理發起 HTTP/HTTPS 請求;`allow_redirects=True` 以適配常見網關跳轉。
- `TIMEOUT`:統一的超時閾值,分別作用於 TCP 探測與 `requests` 的連接/讀取等待(並非對整個請求總時長的硬性上限);建議根據你的業務 RTT 與代理地區設定 3–10 秒。
- 返回體中保留 <span style="color:red">error</span> 字段用於異常統計(如 `ProxyError`、`ConnectTimeout`)。
三、併發與多協議(異步進階版)
支持 HTTP/HTTPS/SOCKS5,並採集更細的時間指標(<span style="color:red">connect\_time / tls\_time / ttfb / total</span>)。
依賴:aiohttp>=3.9,aiohttp-socks>=0.8(如需 SOCKS5)。
# file: proxy_check_async.py
import asyncio, time, csv, os
from contextlib import asynccontextmanager
from aiohttp import ClientSession, TCPConnector, ClientTimeout
try:
from aiohttp_socks import ProxyConnector # 支持 socks5:// 或 socks5h://
HAS_SOCKS = True
except Exception:
HAS_SOCKS = False
# Runtime settings, each overridable through an environment variable of the same name.
TARGET_URL = os.environ.get("TARGET_URL", "http://example.com/")
CONCURRENCY = int(os.environ.get("CONCURRENCY", "200"))
TIMEOUT = float(os.environ.get("TIMEOUT", "6"))
RETRIES = int(os.environ.get("RETRIES", "1"))
OUT_CSV = os.environ.get("OUT_CSV", "proxy_result.csv")
def make_connector(proxy: str):
    """Build the aiohttp connector that matches the proxy's URL scheme.

    Proxies whose URL starts with ``socks5`` get a ``ProxyConnector`` built
    from the URL; every other proxy gets a plain ``TCPConnector`` (the proxy
    URL is then expected to be supplied at request level).

    Raises:
        RuntimeError: The proxy is SOCKS5 but aiohttp-socks is not importable.
    """
    pool_opts = {"ttl_dns_cache": 60, "keepalive_timeout": 15}
    if not proxy.startswith("socks5"):
        # NOTE(review): ssl=False is kept as-is; in aiohttp this switches off
        # TLS certificate verification -- confirm that is intended.
        return TCPConnector(ssl=False, **pool_opts)
    if HAS_SOCKS:
        return ProxyConnector.from_url(proxy, **pool_opts)
    raise RuntimeError("缺少 aiohttp-socks 以支持 SOCKS5")
@asynccontextmanager
async def session_for_proxy(proxy: str):
    """Open a dedicated ClientSession for one proxy and yield ``(session, request_proxy)``.

    ``request_proxy`` is None for SOCKS5 proxies, because the connector
    returned by ``make_connector`` already carries the proxy; for HTTP/HTTPS
    proxies it is the proxy URL, to be passed with each request. The session
    is closed when the context exits.
    """
    connector = make_connector(proxy)
    timeout = ClientTimeout(total=TIMEOUT)
    request_proxy = None if proxy.startswith("socks5") else proxy
    async with ClientSession(connector=connector, timeout=timeout) as sess:
        yield sess, request_proxy
async def fetch_once(sess: ClientSession, proxy_opt: str|None):
    """Perform a single GET of TARGET_URL and time it.

    Args:
        sess: Session to issue the request on.
        proxy_opt: Proxy URL passed at request level, or None when the
            session's connector already routes through the proxy.

    Returns:
        A 4-tuple ``(ok, status, timings, error)``:
        ``ok`` is True only for a 2xx/3xx status; ``status`` is the HTTP
        status code or -1 when the request raised; ``timings`` is a dict with
        ``connect_time``, ``ttfb``, ``total`` (seconds) and ``tls_time``
        (always None here); ``error`` is the exception class name or "".
    """
    t0 = time.perf_counter()
    try:
        async with sess.get(TARGET_URL, proxy=proxy_opt) as r:
            # Response headers are available at this point; this one
            # timestamp serves as the approximation for both connect_time
            # and ttfb.
            t1 = time.perf_counter()
            await r.read()  # drain the body so `total` covers the full transfer
            t2 = time.perf_counter()
            first_byte = round(t1 - t0, 3)
            # Only 2xx/3xx count as available; a 4xx/5xx answer is a failed
            # check even though the request itself completed.
            ok = 200 <= r.status < 400
            return ok, r.status, {
                "connect_time": first_byte,
                "ttfb": first_byte,
                "total": round(t2 - t0, 3),
                "tls_time": None,  # not measured in this simplified version
            }, ""
    except Exception as e:
        # Deliberately broad: any failure is reported by class name so the
        # caller can tabulate error types.
        elapsed = round(time.perf_counter() - t0, 3)
        return False, -1, {
            "connect_time": None,
            "ttfb": None,
            "total": elapsed,
            "tls_time": None,
        }, type(e).__name__
async def check_one(proxy: str):
    """Check one proxy, attempting up to RETRIES times, and return one summary row.

    RETRIES is used as the number of attempts (at least one is always made).
    All attempts share a single session opened for this proxy.

    Returns:
        The successful attempt with the smallest ``total`` if any attempt
        succeeded; otherwise the first failed attempt. The row is a dict with
        ``proxy``, ``ok``, ``http_code``, the timing fields returned by
        ``fetch_once`` and ``error``.
    """
    best_ok = None
    first_failure = None
    async with session_for_proxy(proxy) as (sess, proxy_opt):
        for _ in range(max(1, RETRIES)):
            ok, code, tm, err = await fetch_once(sess, proxy_opt)
            item = {"proxy": proxy, "ok": ok, "http_code": code, **tm, "error": err}
            if ok:
                if best_ok is None or tm["total"] < best_ok["total"]:
                    best_ok = item
            elif first_failure is None:
                first_failure = item
    # A success always outranks a failure, regardless of how quickly the
    # failure happened; successes and failures are never compared on time.
    return best_ok if best_ok is not None else first_failure
async def main():
    """Check every proxy listed in PROXIES concurrently and write the rows to OUT_CSV.

    PROXIES is a comma-separated list read from the environment; at most
    CONCURRENCY checks run at the same time. A one-line summary is printed
    after the CSV has been written.
    """
    env_value = os.getenv("PROXIES", "http://127.0.0.1:8080,socks5://127.0.0.1:1080")
    proxies = [part.strip() for part in env_value.split(",") if part.strip()]
    gate = asyncio.Semaphore(CONCURRENCY)

    async def worker(p):
        # The semaphore caps the number of in-flight checks.
        async with gate:
            return await check_one(p)

    results = await asyncio.gather(*map(worker, proxies))

    columns = ["proxy", "ok", "http_code", "connect_time", "tls_time", "ttfb", "total", "error"]
    with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()
        for row in results:
            writer.writerow(row)
    print(f"寫入CSV: {OUT_CSV},合格數: {sum(1 for r in results if r['ok'])}/{len(results)}")


if __name__ == "__main__":
    # Script entry point: run the async pipeline to completion.
    asyncio.run(main())
逐段解釋:
- `make_connector`:按協議選擇連接器;<span style="color:red">SOCKS5</span> 需 `aiohttp-socks`。
- `session_for_proxy`:為每個代理創建 獨立會話,隔離連接池與 DNS 緩存,避免相互污染。
- `fetch_once`:在 單次嘗試 中記錄時延;`ttfb` 近似等同於首包時間(腳本中 `connect_time` 與 `ttfb` 取自同一時間點,數值相同)。生產若需更精準 TLS 階段,可接入 `TraceConfig`。
- `check_one`:帶 <span style="color:red">重試</span>,擇優保留最優總時延;失敗保留首個錯誤類型用於統計。
- `CONCURRENCY`:併發度控制;建議 100–500 之間,根據機器與帶寬調優。
- CSV 輸出:使得可用代理可直接被上游系統消費;易於做趨勢版報表。
四、判定指標與閾值建議(vditor/Markdown 表格)
| 指標 | 含義 | 建議閾值(跨境/同城) | 業務解讀 |
|---|---|---|---|
| <span style="color:red">connect\_time</span> | TCP 建連到完成握手(近似) | 200–800ms / 20–80ms | 網絡路徑與擁塞度;過高優先淘汰 |
| <span style="color:red">tls\_time</span> | TLS 握手耗時 | 100–500ms / 10–50ms | 證書鏈與中間節點質量 |
| <span style="color:red">ttfb</span> | 首字節到達 | 300–1200ms / 50–200ms | 代理+目標端響應綜合能力 |
| <span style="color:red">total</span> | 端到端總耗時 | ≤ 2.5s / ≤ 0.8s | 最終體驗閾值 |
| <span style="color:red">http\_code</span> | HTTP 狀態碼 | 2xx/3xx | 業務可達性判定核心 |
| <span style="color:red">error</span> | 異常類型 | 空為理想 | 便於快速定位(超時/拒絕/認證失敗) |
五、工作流(Mermaid)
六、工程化落地建議
- 輸入規範化:統一代理格式 `scheme://[user:pass@]host:port`,避免解析歧義。
- 分層閾值:以 <span style="color:red">領域閾值</span>(跨境/同城/內網)分級,防止“一刀切”誤殺。
- 巡檢與淘汰:每 5–10 分鐘滾動抽檢;連續 N 次失敗進入隔離池,冷卻後再評估。
- 多目標探活:準備<span style="color:red">多條</span>內部/外部探活URL,以防單點目標異常導致誤判。
- 合規與審計:輸出 CSV + 日誌歸檔,確保變更與質量可追溯。
七、典型問題與處理策略
- 認證失敗:返回 `407`(Proxy Authentication Required)或 Unauthorized,需核對 `user:pass`;統一配置密鑰管控,避免明文散落。
- DNS 泄露:使用 `socks5h://` 由代理端解析域名,降低本地 DNS 側寫風險。
- 波動與抖動:藉助 <span style="color:red">重試與分位數</span>(如 p50/p90)做最終判定,而非單點樣本。
- 端口阻斷:先 `socket` 探測,能顯著降低總耗時並早停失敗任務。
八、指標公式(用於報表/看板)
- <span style="color:red">可用率</span> = $\dfrac{\text{合格次數}}{\text{總檢測次數}}$
- <span style="color:red">健康度得分</span> = $w_1\left(1 - \dfrac{\text{p90\_total}}{\text{閾值}}\right) + w_2\cdot\text{成功率} + w_3\left(1 - \text{失敗類型權重}\right)$(權重按業務定製)
九、結語
以 “異步高併發 + 多維時延度量 + 閾值分層” 為主線,你可以快速構建面向生產的 代理可用性 質量門禁。先用最小可用腳本完成 驗證閉環,再切換到併發版,接入你的調度與告警體系,做到 <span style="color:red">穩定、可解釋、能回溯</span>。如有需要,我可按你的業務場景補齊 採樣策略、報表模板、以及容器化部署清單。