Add maxlaw PC spider and shared proxy limiter

This commit is contained in:
hello-dd-code
2026-04-03 16:06:28 +08:00
parent ff5e04d986
commit ba04fe42fc
9 changed files with 651 additions and 78 deletions
+170 -55
View File
@@ -1,76 +1,191 @@
"""
全局请求速率限制器
确保代理每秒不超过5次请求
默认按“所有爬虫进程共享一个桶”来限流,避免 `bash start.sh`
同时启动多个进程时,每个进程各自 5 次/秒,叠加后把代理冲爆。
"""
from contextlib import contextmanager
import json
import os
import tempfile
import time
import threading
from collections import deque
from pathlib import Path
from uuid import uuid4
import fcntl
class RateLimiter:
"""
令牌桶算法实现的速率限制器
基于文件锁的跨进程滑动窗口限流器。
- 同一台机器上的多个 Python 进程会共享同一个状态文件
- 同一个进程内的多个线程也会一起走这个限流器
"""
def __init__(self, max_requests_per_second: int = 5):
"""
初始化速率限制器
Args:
max_requests_per_second: 每秒最大请求数
"""
self.max_requests = max_requests_per_second
self.requests = deque()
self.lock = threading.RLock()
def acquire(self):
"""
获取请求权限,如果需要则等待
"""
with self.lock:
now = time.time()
# 清理超过1秒的请求记录
while self.requests and now - self.requests[0] >= 1.0:
self.requests.popleft()
# 如果当前请求数已达上限,等待
if len(self.requests) >= self.max_requests:
# 计算需要等待的时间
wait_time = 1.0 - (now - self.requests[0])
if wait_time > 0:
time.sleep(wait_time)
return self.acquire() # 递归调用以重新检查
# 记录这次请求
self.requests.append(now)
def __init__(
self,
max_requests_per_second: int = 5,
window_seconds: float = 1.0,
state_file: str | None = None,
):
self.max_requests = max(1, int(max_requests_per_second))
self.max_concurrent = max(
1,
int(os.getenv("PROXY_MAX_CONCURRENT_REQUESTS", str(self.max_requests))),
)
self.window_seconds = max(0.1, float(window_seconds))
self.lease_seconds = max(
5.0,
float(os.getenv("PROXY_REQUEST_LEASE_SECONDS", "120")),
)
default_state = os.path.join(
tempfile.gettempdir(),
"lawyers_proxy_rate_limiter.json",
)
self.state_file = Path(
state_file or os.getenv("PROXY_RATE_LIMIT_FILE", default_state)
)
self.lock_file = self.state_file.with_suffix(self.state_file.suffix + ".lock")
self._thread_lock = threading.RLock()
self.state_file.parent.mkdir(parents=True, exist_ok=True)
self.lock_file.parent.mkdir(parents=True, exist_ok=True)
def _load_state(self) -> dict:
if not self.state_file.exists():
return {"timestamps": [], "leases": {}}
try:
raw = self.state_file.read_text(encoding="utf-8").strip()
if not raw:
return {"timestamps": [], "leases": {}}
data = json.loads(raw)
if isinstance(data, list):
return {
"timestamps": [float(item) for item in data],
"leases": {},
}
if not isinstance(data, dict):
return {"timestamps": [], "leases": {}}
timestamps = data.get("timestamps", []) or []
leases = data.get("leases", {}) or {}
return {
"timestamps": [float(item) for item in timestamps],
"leases": {str(key): float(value) for key, value in leases.items()},
}
except Exception:
return {"timestamps": [], "leases": {}}
def _save_state(self, state: dict) -> None:
payload = json.dumps(state, ensure_ascii=False)
self.state_file.write_text(payload, encoding="utf-8")
def _normalize_state(self, state: dict, now: float) -> dict:
timestamps = [
float(ts)
for ts in (state.get("timestamps", []) or [])
if now - float(ts) < self.window_seconds
]
leases = {
str(key): float(value)
for key, value in (state.get("leases", {}) or {}).items()
if now - float(value) < self.lease_seconds
}
return {"timestamps": timestamps, "leases": leases}
def acquire(self) -> None:
token = None
while True:
token = self.try_acquire_slot()
if token:
self.release(token)
return
time.sleep(0.05)
def try_acquire_slot(self) -> str | None:
while True:
wait_time = 0.0
with self._thread_lock:
with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
now = time.time()
state = self._normalize_state(self._load_state(), now)
timestamps = state["timestamps"]
leases = state["leases"]
if len(timestamps) < self.max_requests and len(leases) < self.max_concurrent:
token = uuid4().hex
timestamps.append(now)
leases[token] = now
self._save_state(state)
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
return token
wait_candidates = []
if len(timestamps) >= self.max_requests and timestamps:
wait_candidates.append(self.window_seconds - (now - timestamps[0]))
if len(leases) >= self.max_concurrent:
wait_candidates.append(0.05)
wait_time = max(0.05, min([item for item in wait_candidates if item > 0] or [0.05]))
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
time.sleep(wait_time)
def release(self, token: str | None) -> None:
if not token:
return
with self._thread_lock:
with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
now = time.time()
state = self._normalize_state(self._load_state(), now)
leases = state["leases"]
if token in leases:
leases.pop(token, None)
self._save_state(state)
else:
self._save_state(state)
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
def can_make_request(self) -> bool:
"""
检查是否可以立即发起请求(非阻塞)
"""
with self.lock:
now = time.time()
# 清理超过1秒的请求记录
while self.requests and now - self.requests[0] >= 1.0:
self.requests.popleft()
return len(self.requests) < self.max_requests
with self._thread_lock:
with open(self.lock_file, "a+", encoding="utf-8") as lock_fp:
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_EX)
now = time.time()
state = self._normalize_state(self._load_state(), now)
self._save_state(state)
allowed = (
len(state["timestamps"]) < self.max_requests
and len(state["leases"]) < self.max_concurrent
)
fcntl.flock(lock_fp.fileno(), fcntl.LOCK_UN)
return allowed
# Global rate limiter instance shared by every spider in this process (and,
# via the state file, by every other process on the machine).  The per-second
# budget is configurable through the PROXY_MAX_REQUESTS_PER_SECOND env var.
global_rate_limiter = RateLimiter(
    max_requests_per_second=int(os.getenv("PROXY_MAX_REQUESTS_PER_SECOND", "5"))
)
def wait_for_request():
    """Block until the shared limiter allows one more request to start."""
    global_rate_limiter.acquire()
def can_request_now() -> bool:
    """Non-blocking check: return True if a request could start immediately."""
    return global_rate_limiter.can_make_request()
@contextmanager
def request_slot():
    """Hold a machine-wide shared request slot for the duration of the block.

    Acquiring the slot enforces both limits at once: how many requests may
    start per window, and how many may be in flight simultaneously.  The slot
    is always released on exit, even if the body raises.
    """
    token = global_rate_limiter.try_acquire_slot()
    try:
        yield
    finally:
        global_rate_limiter.release(token)