# lawyers/common_sites/findlaw.py
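"""Spider for 找法网 (m.findlaw.cn).

Pages through the mobile lawyer listing for every known city and inserts
new records into the `lawyer` table, de-duplicating on phone number.
"""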
import json
import os
import sys
import time
import random
from typing import Dict, List, Set, Optional
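
# Make the project root and its request/ directory importable when this
# module is run as a standalone script.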
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
from request.proxy_config import get_proxies, report_proxy_status
from Db import Db
from utils.rate_limiter import request_slot
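
# Site name as stored in the lawyer.domain column; the de-duplication query
# in _existing_phones matches on this exact value.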
DOMAIN = "找法网"
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"


class FindlawSpider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.session = self._build_session()
        self.cities = self._load_cities()

    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        # Ignore proxy settings from environment variables; rely only on the
        # explicitly configured proxies.
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
        else:
            session.proxies.clear()
        session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "Connection": "close",
        })
        return session
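
    # Tear down the current session and rebuild it with fresh proxy settings;
    # called from _get() after a 403 before retrying.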
    def _refresh_session(self) -> None:
        try:
            self.session.close()
        except Exception:
            pass
        self.session = self._build_session()
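
    # Rate-limited GET with exponential backoff on 403 (rebuilding the session
    # between attempts) and a one-shot fallback to verify=False on SSL errors.
    # Returns the response body text, or None when every attempt fails.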
    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
        headers = {"Referer": referer}
        for attempt in range(max_retries):
            try:
                with request_slot():
                    resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
                    status_code = resp.status_code
                    text = resp.text
                    resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403 blocked, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries}): {url}")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(f"Request failed {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
                return text
            except requests.exceptions.SSLError:
                if verify:
                    # Retry once with certificate verification disabled.
                    return self._get(url, referer, verify=False, max_retries=max_retries)
                print(f"SSL error {url}")
                return None
            except requests.exceptions.RequestException as exc:
                print(f"Request failed {url}: {exc}")
                return None
        return None
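
    # Return the subset of `phones` already stored for this DOMAIN, querying
    # in chunks of 500 numbers to keep the SQL IN clause bounded.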
    def _existing_phones(self, phones: List[str]) -> Set[str]:
        if not phones:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing
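
    # Load the level-2 findlaw city list (city, province, pinyin), trying
    # area_new, then area2, then area, and reporting counts when none match.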
    def _load_cities(self):
        condition = "domain='findlaw' AND level=2"
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
            except Exception as exc:
                last_error = exc
                continue
            if rows:
                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
                print(f"[找法网] city source table: {table}, cities: {len(rows)}, missing pinyin: {missing_pinyin}")
                return rows
        if last_error:
            print(f"[找法网] failed to load area data: {last_error}")
        print("[找法网] no city data (tried area_new/area2/area)")
        for table in tables:
            try:
                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
                print(f"[找法网] check: {table} rows matching condition: {c}")
            except Exception:
                pass
        return []
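
    # Fetch one listing page and normalise each lawyer item into a row dict
    # matching the `lawyer` table's columns.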
    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
        text = self._get(url, referer, verify=True)
        if not text:
            return []
        try:
            # Some responses are prefixed with a BOM or wrapped in extra
            # markup, so fall back to slicing out the outermost JSON object.
            text = text.strip().lstrip("\ufeff")
            try:
                data = json.loads(text)
            except ValueError:
                json_start = text.find('{')
                json_end = text.rfind('}')
                if json_start == -1 or json_end == -1:
                    print(f"JSON parse failed {url}: response starts with: {text[:80]!r}")
                    return []
                cleaned = text[json_start:json_end + 1]
                data = json.loads(cleaned)
            if isinstance(data, str):
                # Occasionally the payload is double-encoded JSON.
                try:
                    data = json.loads(data)
                except ValueError:
                    print(f"JSON parse failed {url}: still a string after second decode, starts with: {str(data)[:80]!r}")
                    return []
        except ValueError as exc:
            print(f"JSON parse failed {url}: {exc}")
            return []
        items = (data.get("data") or {}).get("lawyer_list") or []
        parsed = []
        for item in items:
            phone = (item.get("mobile") or "").replace("-", "")
            parsed.append({
                "name": item.get("username", ""),
                "law_firm": item.get("lawyer_lawroom", ""),
                "province": (item.get("areaInfo") or {}).get("province", ""),
                "city": (item.get("areaInfo") or {}).get("city", ""),
                "phone": phone,
                "url": url,
                "domain": DOMAIN,
                "create_time": int(time.time()),
                "params": json.dumps(item, ensure_ascii=False)
            })
        return parsed
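
    # Main loop: page through every city's listing, skip phone numbers that
    # are already in the DB, insert the rest, and move on once a page is empty.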
    def run(self):
        print("Starting 找法网 collection...")
        if not self.cities:
            print("No city data")
            return
        for city in self.cities:
            pinyin = city.get("pinyin")
            province = city.get("province", "")
            city_name = city.get("city", "")
            if not pinyin:
                continue
            print(f"Collecting {province}-{city_name}")
            page = 1
            while True:
                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
                print(f"Page {page}: {url}")
                items = self._fetch_page(url, referer)
                if not items:
                    break
                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
                existing = self._existing_phones(phones)
                for entry in items:
                    phone = entry.get("phone")
                    if not phone:
                        continue
                    if phone in existing:
                        print(f"  -- already exists: {entry['name']} ({phone})")
                        continue
                    try:
                        self.db.insert_data("lawyer", entry)
                        print(f"  -> added: {entry['name']} ({phone})")
                    except Exception as exc:
                        print(f"  insert failed: {exc}")
                page += 1
        print("找法网 collection finished")


if __name__ == "__main__":
    with Db() as db:
        spider = FindlawSpider(db)
        spider.run()