282 lines
10 KiB
Python
282 lines
10 KiB
Python
"""Spider for maxlaw.cn ("大律师") mobile lawyer listings."""

import json
import os
import random
import sys
import time
from typing import Dict, Optional

# Make the sibling "request" package and the project root importable
# BEFORE the project-local imports below — this ordering is load-bearing.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
import urllib3
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from request.proxy_config import get_proxies, report_proxy_status

# Requests are made with verify=False, so silence the resulting SSL warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from Db import Db
from utils.rate_limiter import request_slot

# Source label stored in the "domain" column of every crawled row.
DOMAIN = "大律师"
# Mobile list page: one city (pinyin slug) per URL, paginated via ?page=N.
LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
# Process-wide flag so the optional proxy self-test runs at most once.
_PROXY_TESTED = False
|
class DlsSpider:
    """Crawls lawyer contact records from m.maxlaw.cn into the database."""

    def __init__(self, db_connection):
        """Store the DB handle, build an HTTP session, and preload area rows.

        db_connection: an open Db instance used for lookups and inserts.
        """
        self.db = db_connection
        self.session = self._build_session()
        self.areas = self._load_areas()
|
def _build_session(self) -> requests.Session:
|
|
"""构建带重试机制的 session"""
|
|
report_proxy_status()
|
|
s = requests.Session()
|
|
s.trust_env = False
|
|
proxies = get_proxies()
|
|
if proxies:
|
|
s.proxies.update(proxies)
|
|
else:
|
|
s.proxies.clear()
|
|
self._proxy_test(s, proxies)
|
|
# 配置重试策略
|
|
retries = Retry(
|
|
total=3, # 总共重试3次
|
|
backoff_factor=1, # 重试间隔:1s, 2s, 4s
|
|
status_forcelist=(429, 500, 502, 503, 504), # 对这些状态码进行重试
|
|
allowed_methods=frozenset(["GET", "POST"]),
|
|
raise_on_status=False # 不立即抛出异常,让代码处理
|
|
)
|
|
adapter = HTTPAdapter(max_retries=retries)
|
|
s.mount("https://", adapter)
|
|
s.mount("http://", adapter)
|
|
s.headers.update({
|
|
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
|
|
"Host": "m.maxlaw.cn",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
|
"Connection": "close",
|
|
})
|
|
return s
|
|
|
|
def _refresh_session(self) -> None:
|
|
try:
|
|
self.session.close()
|
|
except Exception:
|
|
pass
|
|
self.session = self._build_session()
|
|
|
|
def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None:
|
|
global _PROXY_TESTED
|
|
if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
|
|
return
|
|
_PROXY_TESTED = True
|
|
if not proxies:
|
|
print("[proxy] test skipped: no proxy configured")
|
|
return
|
|
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
|
|
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
|
|
try:
|
|
resp = session.get(
|
|
test_url,
|
|
timeout=timeout,
|
|
headers={"Connection": "close"},
|
|
)
|
|
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
|
|
except Exception as exc:
|
|
print(f"[proxy] test failed: {exc}")
|
|
|
|
def _load_areas(self):
|
|
try:
|
|
return self.db.select_data(
|
|
"area_new",
|
|
"province, city, pinyin",
|
|
"domain='maxlaw'"
|
|
) or []
|
|
except Exception as exc:
|
|
print(f"加载地区失败: {exc}")
|
|
return []
|
|
|
|
def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
|
|
"""发送 GET 请求,带重试机制"""
|
|
for attempt in range(max_retries):
|
|
try:
|
|
# 使用更长的超时时间,分别设置连接和读取超时
|
|
with request_slot():
|
|
resp = self.session.get(
|
|
url,
|
|
timeout=(10, 30), # (connect_timeout, read_timeout)
|
|
verify=False,
|
|
headers=headers,
|
|
)
|
|
status_code = resp.status_code
|
|
content = resp.text
|
|
resp.close()
|
|
if status_code == 403:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
|
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
self._refresh_session()
|
|
time.sleep(wait_time)
|
|
continue
|
|
print(f"请求失败 {url}: 403 Forbidden")
|
|
return None
|
|
if status_code >= 400:
|
|
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
|
|
return content
|
|
except requests.exceptions.ConnectTimeout as exc:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt # 指数退避:2s, 4s, 8s
|
|
print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
time.sleep(wait_time)
|
|
else:
|
|
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
|
|
return None
|
|
except requests.exceptions.Timeout as exc:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt
|
|
print(f"请求超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
time.sleep(wait_time)
|
|
else:
|
|
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
|
|
return None
|
|
except requests.exceptions.ConnectionError as exc:
|
|
if attempt < max_retries - 1:
|
|
wait_time = 2 ** attempt
|
|
print(f"连接错误,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
|
|
time.sleep(wait_time)
|
|
else:
|
|
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
|
|
return None
|
|
except requests.exceptions.RequestException as exc:
|
|
print(f"请求失败 {url}: {exc}")
|
|
return None
|
|
|
|
return None
|
|
|
|
def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
cards = soup.find_all("div", class_="lstx")
|
|
if not cards:
|
|
return 0
|
|
|
|
inserted = 0
|
|
for card in cards:
|
|
link = card.find("a")
|
|
if not link or not link.get("href"):
|
|
continue
|
|
detail = self._parse_detail(link['href'], province, city, list_url)
|
|
if not detail:
|
|
continue
|
|
phone = detail.get("phone")
|
|
if not phone:
|
|
continue
|
|
condition = f"phone='{phone}' and domain='{DOMAIN}'"
|
|
if self.db.is_data_exist("lawyer", condition):
|
|
print(f" -- 已存在: {detail['name']} ({phone})")
|
|
time.sleep(0.3)
|
|
continue
|
|
try:
|
|
self.db.insert_data("lawyer", detail)
|
|
inserted += 1
|
|
print(f" -> 新增: {detail['name']} ({phone})")
|
|
except Exception as exc:
|
|
print(f" 插入失败: {exc}")
|
|
time.sleep(1)
|
|
time.sleep(0.3)
|
|
# 列表页结束后再缓一缓,降低风控
|
|
time.sleep(0.6)
|
|
return inserted
|
|
|
|
def _detail_headers(self, referer: str) -> Dict[str, str]:
|
|
return {
|
|
"Referer": referer,
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
|
"Cache-Control": "no-cache",
|
|
"Pragma": "no-cache",
|
|
"Upgrade-Insecure-Requests": "1",
|
|
}
|
|
|
|
def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
|
|
url = f"https://m.maxlaw.cn{path}"
|
|
print(f" 详情: {url}")
|
|
html = self._get(url, headers=self._detail_headers(list_url))
|
|
if not html:
|
|
return None
|
|
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
name_tag = soup.find("h2", class_="lawyerName")
|
|
law_firm_tag = soup.find("p", class_="law-firm")
|
|
contact_list = soup.find("ul", class_="contact-content")
|
|
|
|
name = name_tag.get_text(strip=True) if name_tag else ""
|
|
law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
|
|
phone = ""
|
|
|
|
if contact_list:
|
|
items = contact_list.find_all("li")
|
|
if len(items) > 2:
|
|
phone_tag = items[2].find("p")
|
|
if phone_tag:
|
|
phone = phone_tag.get_text(strip=True)
|
|
phone = phone.split("咨询请说明来自大律师网")[0].strip()
|
|
|
|
phone = phone.replace('-', '').strip()
|
|
if not name or not phone:
|
|
print(" 信息不完整,跳过")
|
|
return None
|
|
|
|
safe_city = city if city else province
|
|
return {
|
|
"name": name,
|
|
"law_firm": law_firm,
|
|
"province": province,
|
|
"city": safe_city,
|
|
"phone": phone,
|
|
"url": url,
|
|
"domain": DOMAIN,
|
|
"create_time": int(time.time()),
|
|
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
|
|
}
|
|
|
|
def run(self):
|
|
print("启动大律师采集...")
|
|
if not self.areas:
|
|
print("无地区数据")
|
|
return
|
|
|
|
for area in self.areas:
|
|
pinyin = area.get("pinyin")
|
|
province = area.get("province", "")
|
|
city = area.get("city", "")
|
|
if not pinyin:
|
|
continue
|
|
page = 1
|
|
while True:
|
|
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
|
|
print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
|
|
html = self._get(list_url)
|
|
if not html:
|
|
break
|
|
inserted = self._parse_list(html, province, city, list_url)
|
|
if inserted == 0:
|
|
break
|
|
page += 1
|
|
print("大律师采集完成")
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point: run the spider with a context-managed DB connection.
    with Db() as db:
        DlsSpider(db).run()
|