chore: initialize lawyers crawler project

hello-dd-code
2026-03-02 00:19:48 +08:00
commit 03847a4b8e
17 changed files with 1928 additions and 0 deletions
@@ -0,0 +1,325 @@
import json
import os
import re
import sys
import time
import random
from typing import Dict, Optional, Tuple
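# Make the sibling "request" package and the project root importable
# when this file is run directly as a script.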
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)
from bs4 import BeautifulSoup
from request.requests_client import RequestClientError, RequestsClient
from Db import Db
from config import HEADERS
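# LIST_URL: mobile lawyer-list endpoint of 66law.cn (paginated JSON).
# DOMAIN: source tag ("华律", i.e. Hualv) stored with every DB row.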
LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
DOMAIN = "华律"
class HualvSpider:
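    """Crawler for lawyer contact pages on m.66law.cn (华律网)."""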
    def __init__(self, db_connection):
        self.db = db_connection
        self.client = self._build_session()
        self.areas = self._load_areas()
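    # A mobile Safari UA makes requests look like a phone browser;
    # "Connection: close" avoids reusing stale keep-alive sockets.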
    def _build_session(self) -> RequestsClient:
        custom_headers = HEADERS.copy()
        custom_headers['User-Agent'] = (
            'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
            'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
            'Mobile/15E148 Safari/604.1'
        )
        custom_headers["Connection"] = "close"
        return RequestsClient(headers=custom_headers)
    def _refresh_session(self) -> None:
        self.client.refresh()
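    # Load province/city codes scraped for 66law, trying each candidate
    # area table in turn until one yields city rows.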
    def _load_areas(self):
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                provinces = self.db.select_data(
                    table,
                    "code, province, pinyin, id",
                    "domain='66law' AND level=1"
                ) or []
                cities = self.db.select_data(
                    table,
                    "code, city, province, pid",
                    "domain='66law' AND level=2"
                ) or []
            except Exception as exc:
                last_error = exc
                continue
            if not cities:
                continue
            province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
            city_map = {}
            for city in cities:
                province_info = province_map.get(city.get('pid'), {}) or {}
                province_code = province_info.get('code')
                city_map[city.get('code')] = {
                    "name": city.get('city'),
                    "province": city.get('province'),
                    "province_code": province_code,
                }
            print(f"[华律] city source table: {table}, cities: {len(cities)}")
            return city_map
        if last_error:
            print(f"[华律] failed to load area data: {last_error}")
        print("[华律] no city data (tried area_new/area2/area)")
        return {}
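    # POST to the list API with exponential backoff on 403, refreshing the
    # session between attempts; returns the decoded JSON or None.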
    def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
        for attempt in range(max_retries):
            try:
                resp = self.client.post_text(LIST_URL, data=data, timeout=20, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"403 blocked, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print("Request failed: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error")
                try:
                    return json.loads(text)
                except ValueError as exc:
                    print(f"Failed to parse JSON: {exc}")
                    return None
            except RequestClientError as exc:
                print(f"Request failed: {exc}")
                return None
        return None
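    # Fetch a lawyer's contact page, extract name/phone/firm, and dedupe
    # against the DB first by URL and then by phone number.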
    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
        contact_url = f"{url}lawyer_contact.aspx"
        print(f" detail: {contact_url}")
        existing = self.db.select_data(
            "lawyer",
            "id, avatar_url",
            f"domain='{DOMAIN}' AND url='{contact_url}'"
        )
        existing_id = None
        if existing:
            existing_id = existing[0].get("id")
            avatar = (existing[0].get("avatar_url") or "").strip()
            if avatar:
                print(" -- already stored with avatar, skipping")
                return None
        html = self._get_detail(contact_url)
        if not html:
            return None
        soup = BeautifulSoup(html, "html.parser")
        info_list = soup.find("ul", class_="information-list")
        if not info_list:
            return None
        phone = ""
        law_firm = ""
        for li in info_list.find_all("li"):
            text = li.get_text(strip=True)
            if "手机号" in text:
                cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
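                # Mainland CN mobile numbers are 11 digits starting with 1.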
                match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
                if match:
                    phone = match.group(0)
            if "执业单位" in text:
                law_firm = text.replace("执业单位", "").strip()
        name = ""
        breadcrumb = soup.find("div", class_="weizhi")
        if breadcrumb:
            links = breadcrumb.find_all("a")
            if len(links) > 2:
                name = links[2].get_text(strip=True)
        phone = phone.replace('-', '').strip()
        if not phone or not re.fullmatch(r"1\d{10}", phone):
            print(" no mobile number, skipping")
            return None
        avatar_url, site_time = self._extract_avatar_and_time(soup)
        data = {
            "phone": phone,
            "province": province,
            "city": city,
            "law_firm": law_firm,
            "url": contact_url,
            "avatar_url": avatar_url,
            "create_time": int(time.time()),
            "site_time": site_time,
            "domain": DOMAIN,
            "name": name,
            "params": json.dumps({"source": url}, ensure_ascii=False)
        }
        if existing_id:
            update_data = {
                "avatar_url": avatar_url,
                "site_time": site_time,
            }
            if name:
                update_data["name"] = name
            if law_firm:
                update_data["law_firm"] = law_firm
            if province:
                update_data["province"] = province
            if city:
                update_data["city"] = city
            if phone:
                update_data["phone"] = phone
            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
            try:
                self.db.update_data("lawyer", update_data, f"id={existing_id}")
                print(" -- already stored, filled in avatar/time")
            except Exception as exc:
                print(f" update failed: {exc}")
            return None
        # If the phone number already exists, update avatar/time instead of
        # inserting a new record.
        existing_phone = self.db.select_data(
            "lawyer",
            "id, avatar_url, url",
            f"domain='{DOMAIN}' AND phone='{phone}'"
        )
        if existing_phone:
            existing_row = existing_phone[0]
            avatar = (existing_row.get("avatar_url") or "").strip()
            if avatar:
                print(" -- phone already stored with avatar, skipping")
                return None
            update_data = {
                "avatar_url": avatar_url,
                "site_time": site_time,
            }
            if name:
                update_data["name"] = name
            if law_firm:
                update_data["law_firm"] = law_firm
            if province:
                update_data["province"] = province
            if city:
                update_data["city"] = city
            if phone:
                update_data["phone"] = phone
            if not existing_row.get("url"):
                update_data["url"] = contact_url
            update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
            try:
                self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
                print(" -- phone already stored, filled in avatar/time")
            except Exception as exc:
                print(f" update failed: {exc}")
            return None
        return data
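    # The avatar URL path embeds an upload date; recover it as a rough
    # YYYYMM integer for site_time.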
    def _extract_avatar_and_time(self, soup: BeautifulSoup) -> Tuple[str, Optional[int]]:
        avatar_url = ""
        site_time = None
        img_tag = soup.select_one(
            "div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
        )
        if img_tag:
            src = (img_tag.get("src") or "").strip()
            if src:
                if src.startswith("//"):
                    avatar_url = f"https:{src}"
                else:
                    avatar_url = src
                match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
                if match:
                    site_time = int(f"{match.group(1)}{match.group(2)}")
                else:
                    match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
                    if match:
                        site_time = int(f"{match.group(1)}{match.group(2)}")
        return avatar_url, site_time
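    # GET a detail page, with the same 403 backoff strategy as _post.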
    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
        for attempt in range(max_retries):
            try:
                resp = self.client.get_text(url, timeout=15, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f" 403 blocked, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(" request failed: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error")
                return text
            except RequestClientError as exc:
                print(f" request failed: {exc}")
                return None
        return None
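    # Walk every known city, page through the list API, and insert lawyers
    # that survive the dedupe checks in _parse_detail.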
    def run(self):
        print("Starting 华律网 crawl...")
        if not self.areas:
            print("No city data")
            return
        for city_code, city_info in self.areas.items():
            province_code = city_info.get("province_code")
            if not province_code:
                continue
            province_name = city_info.get("province", "")
            city_name = city_info.get("name", "")
            print(f"Crawling {province_name}-{city_name}")
            page = 1
            while True:
                payload = {"pid": province_code, "cid": city_code, "page": str(page)}
                data = self._post(payload)
                if not data or not data.get("lawyerList"):
                    break
                for item in data["lawyerList"]:
                    result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
                    if not result:
                        continue
                    try:
                        self.db.insert_data("lawyer", result)
                        print(f" -> inserted: {result['name']} ({result['phone']})")
                    except Exception as exc:
                        print(f" insert failed: {exc}")
                    time.sleep(1)
                page_count = (data.get("lawyerItems") or {}).get("pageCount", page)
                if page >= page_count:
                    break
                page += 1
                time.sleep(2)
            time.sleep(1)
        print("华律网 crawl finished")
if __name__ == "__main__":
    with Db() as db:
        spider = HualvSpider(db)
        spider.run()