重构采集脚本并新增按时间导出Excel
- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore,忽略 data/logs 本地产物
This commit is contained in:
+606
-272
@@ -1,10 +1,18 @@
|
||||
import argparse
|
||||
import ast
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import random
|
||||
from typing import Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(current_dir)
|
||||
@@ -14,312 +22,638 @@ if request_dir not in sys.path:
|
||||
if project_root not in sys.path:
|
||||
sys.path.append(project_root)
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from request.requests_client import RequestClientError, RequestsClient
|
||||
|
||||
from Db import Db
|
||||
from config import HEADERS
|
||||
from request.requests_client import RequestClientError, RequestsClient
|
||||
from utils.rate_limiter import wait_for_request
|
||||
|
||||
LIST_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
|
||||
DOMAIN = "华律"
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
SITE_NAME = "hualv"
LEGACY_DOMAIN = "华律"  # value stored in the legacy lawyer.domain column
SITE_BASE = "https://m.66law.cn"
# main-v2.0.js embeds the province/city tables parsed by discover_cities().
CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js"
LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"

# Mainland-China mobile number: 1 + [3-9] + nine more digits.
PHONE_RE = re.compile(r"1[3-9]\d{9}")
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Matches "执业 N 年" ("N years in practice") in profile text.
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")


@dataclass
class CityTarget:
    """One province/city pair to crawl, discovered from the site's JS data."""

    province_id: int    # site-internal province id (pid in the RPC payload)
    province_name: str
    city_id: int        # site-internal city id (cid in the RPC payload)
    city_name: str


def normalize_phone(text: str) -> str:
    """Strip all non-digits from *text* and return the first mobile number, or ''."""
    compact = re.sub(r"\D", "", text or "")
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""


def strip_html_tags(text: str) -> str:
    """Remove anything that looks like an HTML tag and trim surrounding whitespace."""
    return re.sub(r"<[^>]+>", "", text or "").strip()
|
||||
class HualvCrawler:
|
||||
def __init__(
|
||||
self,
|
||||
max_pages: int = 9999,
|
||||
sleep_seconds: float = 0.15,
|
||||
use_proxy: bool = True,
|
||||
db_connection=None,
|
||||
):
|
||||
self.max_pages = max_pages
|
||||
self.sleep_seconds = max(0.0, sleep_seconds)
|
||||
self.db = db_connection
|
||||
self.client = self._build_session()
|
||||
self.areas = self._load_areas()
|
||||
|
||||
def _build_session(self) -> RequestsClient:
|
||||
custom_headers = HEADERS.copy()
|
||||
custom_headers['User-Agent'] = (
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
|
||||
'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 '
|
||||
'Mobile/15E148 Safari/604.1'
|
||||
self.client = RequestsClient(
|
||||
headers={
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
|
||||
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
|
||||
"Mobile/15E148 Safari/604.1"
|
||||
),
|
||||
"Accept": "application/json, text/javascript, */*; q=0.01",
|
||||
"X-Requested-With": "XMLHttpRequest",
|
||||
"Connection": "close",
|
||||
},
|
||||
use_proxy=use_proxy,
|
||||
retry_total=2,
|
||||
retry_backoff_factor=1,
|
||||
retry_status_forcelist=(429, 500, 502, 503, 504),
|
||||
retry_allowed_methods=("GET", "POST"),
|
||||
)
|
||||
custom_headers["Connection"] = "close"
|
||||
return RequestsClient(headers=custom_headers)
|
||||
|
||||
def _refresh_session(self) -> None:
|
||||
self.client.refresh()
|
||||
def _request_text(
|
||||
self,
|
||||
method: str,
|
||||
url: str,
|
||||
*,
|
||||
timeout: int = 20,
|
||||
max_retries: int = 3,
|
||||
referer: str = SITE_BASE,
|
||||
data: Optional[Dict] = None,
|
||||
) -> str:
|
||||
headers = {"Referer": referer}
|
||||
last_error: Optional[Exception] = None
|
||||
|
||||
def _load_areas(self):
|
||||
tables = ("area_new", "area2", "area")
|
||||
last_error = None
|
||||
for table in tables:
|
||||
for attempt in range(max_retries):
|
||||
wait_for_request()
|
||||
try:
|
||||
provinces = self.db.select_data(
|
||||
table,
|
||||
"code, province, pinyin, id",
|
||||
"domain='66law' AND level=1"
|
||||
) or []
|
||||
cities = self.db.select_data(
|
||||
table,
|
||||
"code, city, province, pid",
|
||||
"domain='66law' AND level=2"
|
||||
) or []
|
||||
if method.upper() == "POST":
|
||||
resp = self.client.post_text(
|
||||
url,
|
||||
timeout=timeout,
|
||||
verify=False,
|
||||
headers=headers,
|
||||
data=data,
|
||||
)
|
||||
else:
|
||||
resp = self.client.get_text(
|
||||
url,
|
||||
timeout=timeout,
|
||||
verify=False,
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
code = resp.status_code
|
||||
if code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
self.client.refresh()
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
raise RequestClientError(f"{code} Error: {url}")
|
||||
if code >= 500 and attempt < max_retries - 1:
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
if code >= 400:
|
||||
raise RequestClientError(f"{code} Error: {url}")
|
||||
return resp.text
|
||||
except Exception as exc:
|
||||
last_error = exc
|
||||
continue
|
||||
if attempt < max_retries - 1:
|
||||
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
|
||||
continue
|
||||
raise
|
||||
|
||||
if not cities:
|
||||
continue
|
||||
if last_error is not None:
|
||||
raise last_error
|
||||
raise RequestClientError(f"Unknown request error: {url}")
|
||||
|
||||
province_map = {p.get('id'): {"code": p.get('code'), "name": p.get('province')} for p in provinces}
|
||||
city_map = {}
|
||||
for city in cities:
|
||||
province_info = province_map.get(city.get('pid'), {}) or {}
|
||||
province_code = province_info.get('code')
|
||||
city_map[city.get('code')] = {
|
||||
"name": city.get('city'),
|
||||
"province": city.get('province'),
|
||||
"province_code": province_code,
|
||||
}
|
||||
print(f"[华律] 城市来源表: {table}, 城市数: {len(cities)}")
|
||||
return city_map
|
||||
|
||||
if last_error:
|
||||
print(f"[华律] 加载地区数据失败: {last_error}")
|
||||
print("[华律] 无城市数据(已尝试 area_new/area2/area)")
|
||||
return {}
|
||||
|
||||
def _post(self, data: Dict[str, str], max_retries: int = 3) -> Optional[Dict]:
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
resp = self.client.post_text(LIST_URL, data=data, timeout=20, verify=False)
|
||||
status_code = resp.status_code
|
||||
text = resp.text
|
||||
if status_code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||
self._refresh_session()
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
print("请求失败: 403 Forbidden")
|
||||
return None
|
||||
if status_code >= 400:
|
||||
raise RequestClientError(f"{status_code} Error")
|
||||
try:
|
||||
return json.loads(text)
|
||||
except ValueError as exc:
|
||||
print(f"解析JSON失败: {exc}")
|
||||
return None
|
||||
except RequestClientError as exc:
|
||||
print(f"请求失败: {exc}")
|
||||
return None
|
||||
return None
|
||||
|
||||
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
|
||||
contact_url = f"{url}lawyer_contact.aspx"
|
||||
print(f" 详情: {contact_url}")
|
||||
existing = self.db.select_data(
|
||||
"lawyer",
|
||||
"id, avatar_url",
|
||||
f"domain='{DOMAIN}' AND url='{contact_url}'"
|
||||
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
|
||||
return self._request_text(
|
||||
"GET",
|
||||
url,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
referer=referer,
|
||||
)
|
||||
existing_id = None
|
||||
if existing:
|
||||
existing_id = existing[0].get("id")
|
||||
avatar = (existing[0].get("avatar_url") or "").strip()
|
||||
if avatar:
|
||||
print(" -- 已存在且头像已补全,跳过")
|
||||
return None
|
||||
|
||||
html = self._get_detail(contact_url)
|
||||
if not html:
|
||||
return None
|
||||
def _post_text(
|
||||
self,
|
||||
url: str,
|
||||
*,
|
||||
data: Dict,
|
||||
timeout: int = 20,
|
||||
max_retries: int = 3,
|
||||
referer: str = SITE_BASE,
|
||||
) -> str:
|
||||
return self._request_text(
|
||||
"POST",
|
||||
url,
|
||||
timeout=timeout,
|
||||
max_retries=max_retries,
|
||||
referer=referer,
|
||||
data=data,
|
||||
)
|
||||
|
||||
def _extract_spc_location(self, script_text: str) -> List:
|
||||
# main-v2.js 内置了 sPCLocation=new Array(...),后面紧跟 cateinfo 数组
|
||||
marker = "sPCLocation = new Array("
|
||||
start = script_text.find(marker)
|
||||
if start == -1:
|
||||
marker = "sPCLocation=new Array("
|
||||
start = script_text.find(marker)
|
||||
if start == -1:
|
||||
return []
|
||||
start += len(marker)
|
||||
|
||||
next_marker = script_text.find("cateinfo = new Array(", start)
|
||||
if next_marker == -1:
|
||||
next_marker = script_text.find("cateinfo=new Array(", start)
|
||||
|
||||
if next_marker != -1:
|
||||
end = script_text.rfind(");", start, next_marker)
|
||||
else:
|
||||
end = script_text.find(");", start)
|
||||
|
||||
if end == -1 or end <= start:
|
||||
return []
|
||||
|
||||
raw = "[" + script_text[start:end] + "]"
|
||||
try:
|
||||
data = ast.literal_eval(raw)
|
||||
except Exception:
|
||||
return []
|
||||
return data if isinstance(data, list) else []
|
||||
|
||||
def discover_cities(self) -> List[CityTarget]:
|
||||
script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/")
|
||||
rows = self._extract_spc_location(script_text)
|
||||
|
||||
targets: List[CityTarget] = []
|
||||
seen: Set[Tuple[int, int]] = set()
|
||||
|
||||
for province in rows:
|
||||
if not isinstance(province, list) or len(province) < 3:
|
||||
continue
|
||||
try:
|
||||
province_id = int(province[0])
|
||||
except Exception:
|
||||
continue
|
||||
province_name = str(province[1] or "").strip()
|
||||
city_rows = province[2] if isinstance(province[2], list) else []
|
||||
|
||||
for city in city_rows:
|
||||
if not isinstance(city, list) or len(city) < 2:
|
||||
continue
|
||||
try:
|
||||
city_id = int(city[0])
|
||||
except Exception:
|
||||
continue
|
||||
city_name = str(city[1] or "").strip()
|
||||
if city_id <= 0 or not city_name:
|
||||
continue
|
||||
|
||||
key = (province_id, city_id)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
||||
targets.append(
|
||||
CityTarget(
|
||||
province_id=province_id,
|
||||
province_name=province_name,
|
||||
city_id=city_id,
|
||||
city_name=city_name,
|
||||
)
|
||||
)
|
||||
return targets
|
||||
|
||||
def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]:
|
||||
payload = {
|
||||
"pid": str(target.province_id),
|
||||
"cid": str(target.city_id),
|
||||
"page": str(page),
|
||||
}
|
||||
text = self._post_text(
|
||||
LIST_API_URL,
|
||||
data=payload,
|
||||
referer=SITE_BASE + "/findlawyer/",
|
||||
)
|
||||
data = json.loads((text or "").strip().lstrip("\ufeff") or "{}")
|
||||
items = data.get("lawyerList") or data.get("queryLawyerList") or []
|
||||
if not isinstance(items, list):
|
||||
items = []
|
||||
|
||||
page_count = 0
|
||||
try:
|
||||
page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0)
|
||||
except Exception:
|
||||
page_count = 0
|
||||
return items, page_count
|
||||
|
||||
def parse_detail(self, detail_url: str) -> Dict:
|
||||
contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx"
|
||||
html = self._get_text(contact_url, referer=detail_url)
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
info_list = soup.find("ul", class_="information-list")
|
||||
if not info_list:
|
||||
return None
|
||||
|
||||
phone = ""
|
||||
law_firm = ""
|
||||
for li in info_list.find_all("li"):
|
||||
text = li.get_text(strip=True)
|
||||
if "手机号" in text:
|
||||
cleaned = text.replace("手机号", "").replace("(咨询请说明来自 华律网)", "").strip()
|
||||
match = re.search(r"1\d{10}", cleaned.replace('-', '').replace(' ', ''))
|
||||
if match:
|
||||
phone = match.group(0)
|
||||
if "执业单位" in text:
|
||||
law_firm = text.replace("执业单位", "").strip()
|
||||
full_text = soup.get_text(" ", strip=True)
|
||||
|
||||
name = ""
|
||||
breadcrumb = soup.find("div", class_="weizhi")
|
||||
if breadcrumb:
|
||||
links = breadcrumb.find_all("a")
|
||||
if len(links) > 2:
|
||||
name = links[2].get_text(strip=True)
|
||||
law_firm = ""
|
||||
phone = ""
|
||||
email = ""
|
||||
address = ""
|
||||
license_no = ""
|
||||
practice_years: Optional[int] = None
|
||||
|
||||
phone = phone.replace('-', '').strip()
|
||||
if not phone or not re.fullmatch(r"1\d{10}", phone):
|
||||
print(" 无手机号,跳过")
|
||||
name_tag = soup.select_one(".logo-box .title b")
|
||||
if name_tag:
|
||||
name = name_tag.get_text(strip=True).replace("律师", "").strip()
|
||||
if not name and soup.title:
|
||||
match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True))
|
||||
if match:
|
||||
name = match.group(1).strip()
|
||||
|
||||
phone_candidates = [
|
||||
soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True)
|
||||
if soup.select_one(".logo-box .r-bar .tel")
|
||||
else "",
|
||||
soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True)
|
||||
if soup.select_one(".lawyer-show ul.info")
|
||||
else "",
|
||||
full_text,
|
||||
]
|
||||
for candidate in phone_candidates:
|
||||
phone = normalize_phone(candidate)
|
||||
if phone:
|
||||
break
|
||||
|
||||
for li in soup.select(".lawyer-show ul.info li"):
|
||||
li_text = li.get_text(" ", strip=True)
|
||||
if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
|
||||
law_firm = li_text
|
||||
|
||||
if not law_firm:
|
||||
match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html)
|
||||
if match:
|
||||
law_firm = match.group(1).strip()
|
||||
|
||||
match = re.search(r'"identifier":"([^"]+)"', html)
|
||||
if match:
|
||||
license_no = match.group(1).strip()
|
||||
|
||||
match = re.search(r'"streetAddress":"([^"]+)"', html)
|
||||
if match:
|
||||
address = match.group(1).strip()
|
||||
|
||||
email_match = EMAIL_RE.search(html)
|
||||
if email_match:
|
||||
email = email_match.group(0).strip()
|
||||
|
||||
year_match = YEAR_RE.search(full_text)
|
||||
if year_match:
|
||||
try:
|
||||
practice_years = int(year_match.group(1))
|
||||
except Exception:
|
||||
practice_years = None
|
||||
|
||||
specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")]
|
||||
specialties = [x for x in specialties if x]
|
||||
|
||||
return {
|
||||
"name": name,
|
||||
"law_firm": law_firm,
|
||||
"phone": phone,
|
||||
"email": email,
|
||||
"address": address,
|
||||
"license_no": license_no,
|
||||
"practice_years": practice_years,
|
||||
"specialties": specialties,
|
||||
"detail_url": detail_url,
|
||||
"contact_url": contact_url,
|
||||
}
|
||||
|
||||
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
|
||||
seen_details: Set[str] = set()
|
||||
|
||||
for page in range(1, self.max_pages + 1):
|
||||
try:
|
||||
items, page_count = self.fetch_list_page(target, page)
|
||||
except Exception as exc:
|
||||
print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}")
|
||||
break
|
||||
|
||||
if not items:
|
||||
break
|
||||
|
||||
for item in items:
|
||||
detail_url = str(item.get("lawyerUrl") or "").strip()
|
||||
if not detail_url:
|
||||
continue
|
||||
if detail_url.startswith("//"):
|
||||
detail_url = "https:" + detail_url
|
||||
if not detail_url.startswith("http"):
|
||||
detail_url = urljoin(SITE_BASE, detail_url)
|
||||
|
||||
if detail_url in seen_details:
|
||||
continue
|
||||
seen_details.add(detail_url)
|
||||
|
||||
try:
|
||||
detail = self.parse_detail(detail_url)
|
||||
except Exception as exc:
|
||||
print(f"[detail] 失败 {detail_url}: {exc}")
|
||||
continue
|
||||
|
||||
now = int(time.time())
|
||||
uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url)
|
||||
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
|
||||
|
||||
list_name = str(item.get("name") or "").replace("律师", "").strip()
|
||||
category_text = str(item.get("categoryNames") or "").strip()
|
||||
category_arr = [x.strip() for x in re.split(r"[、,,]", category_text) if x.strip()]
|
||||
|
||||
yield {
|
||||
"record_id": record_id,
|
||||
"collected_at": now,
|
||||
"source": {
|
||||
"site": SITE_NAME,
|
||||
"province_id": target.province_id,
|
||||
"province": target.province_name,
|
||||
"city_id": target.city_id,
|
||||
"city": target.city_name,
|
||||
"page": page,
|
||||
"detail_url": detail_url,
|
||||
"contact_url": detail.get("contact_url", ""),
|
||||
},
|
||||
"list_snapshot": {
|
||||
"lawyer_id": item.get("lawyerId"),
|
||||
"name": list_name,
|
||||
"category_names": category_arr,
|
||||
"help_count": strip_html_tags(str(item.get("helpCount") or "")),
|
||||
"comment_score": strip_html_tags(str(item.get("commentScore") or "")),
|
||||
"response_time": str(item.get("responseTime") or "").strip(),
|
||||
"year": item.get("year"),
|
||||
"is_adv": bool(item.get("isAdv")),
|
||||
},
|
||||
"profile": {
|
||||
"name": detail.get("name") or list_name,
|
||||
"law_firm": detail.get("law_firm") or "",
|
||||
"phone": detail.get("phone") or "",
|
||||
"email": detail.get("email") or "",
|
||||
"address": detail.get("address") or "",
|
||||
"license_no": detail.get("license_no") or "",
|
||||
"practice_years": detail.get("practice_years"),
|
||||
"specialties": detail.get("specialties") or category_arr,
|
||||
},
|
||||
"raw": item,
|
||||
}
|
||||
|
||||
if self.sleep_seconds:
|
||||
time.sleep(self.sleep_seconds)
|
||||
|
||||
if page_count > 0 and page >= page_count:
|
||||
break
|
||||
|
||||
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
|
||||
source = record.get("source", {}) or {}
|
||||
profile = record.get("profile", {}) or {}
|
||||
|
||||
phone = normalize_phone(profile.get("phone", ""))
|
||||
if not phone:
|
||||
return None
|
||||
|
||||
avatar_url, site_time = self._extract_avatar_and_time(soup)
|
||||
data = {
|
||||
"phone": phone,
|
||||
province = (source.get("province") or "").strip()
|
||||
city = (source.get("city") or province).strip()
|
||||
return {
|
||||
"name": (profile.get("name") or "").strip(),
|
||||
"law_firm": (profile.get("law_firm") or "").strip(),
|
||||
"province": province,
|
||||
"city": city,
|
||||
"law_firm": law_firm,
|
||||
"url": contact_url,
|
||||
"avatar_url": avatar_url,
|
||||
"create_time": int(time.time()),
|
||||
"site_time": site_time,
|
||||
"domain": DOMAIN,
|
||||
"name": name,
|
||||
"params": json.dumps({"source": url}, ensure_ascii=False)
|
||||
"phone": phone,
|
||||
"url": (source.get("contact_url") or source.get("detail_url") or "").strip(),
|
||||
"domain": LEGACY_DOMAIN,
|
||||
"create_time": int(record.get("collected_at") or time.time()),
|
||||
"params": json.dumps(record, ensure_ascii=False),
|
||||
}
|
||||
if existing_id:
|
||||
update_data = {
|
||||
"avatar_url": avatar_url,
|
||||
"site_time": site_time,
|
||||
}
|
||||
if name:
|
||||
update_data["name"] = name
|
||||
if law_firm:
|
||||
update_data["law_firm"] = law_firm
|
||||
if province:
|
||||
update_data["province"] = province
|
||||
if city:
|
||||
update_data["city"] = city
|
||||
if phone:
|
||||
update_data["phone"] = phone
|
||||
update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
|
||||
try:
|
||||
self.db.update_data("lawyer", update_data, f"id={existing_id}")
|
||||
print(" -- 已存在,已补全头像/时间")
|
||||
except Exception as exc:
|
||||
print(f" 更新失败: {exc}")
|
||||
return None
|
||||
# 若手机号已存在,则更新头像/时间,不再插入新记录
|
||||
existing_phone = self.db.select_data(
|
||||
"lawyer",
|
||||
"id, avatar_url, url",
|
||||
f"domain='{DOMAIN}' AND phone='{phone}'"
|
||||
)
|
||||
if existing_phone:
|
||||
existing_row = existing_phone[0]
|
||||
avatar = (existing_row.get("avatar_url") or "").strip()
|
||||
if avatar:
|
||||
print(" -- 已存在手机号且头像已补全,跳过")
|
||||
return None
|
||||
update_data = {
|
||||
"avatar_url": avatar_url,
|
||||
"site_time": site_time,
|
||||
}
|
||||
if name:
|
||||
update_data["name"] = name
|
||||
if law_firm:
|
||||
update_data["law_firm"] = law_firm
|
||||
if province:
|
||||
update_data["province"] = province
|
||||
if city:
|
||||
update_data["city"] = city
|
||||
if phone:
|
||||
update_data["phone"] = phone
|
||||
if not existing_row.get("url"):
|
||||
update_data["url"] = contact_url
|
||||
update_data["params"] = json.dumps({"source": url}, ensure_ascii=False)
|
||||
try:
|
||||
self.db.update_data("lawyer", update_data, f"id={existing_row.get('id')}")
|
||||
print(" -- 已存在手机号,已补全头像/时间")
|
||||
except Exception as exc:
|
||||
print(f" 更新失败: {exc}")
|
||||
return None
|
||||
return data
|
||||
|
||||
def _extract_avatar_and_time(self, soup: BeautifulSoup) -> (str, Optional[int]):
|
||||
avatar_url = ""
|
||||
site_time = None
|
||||
img_tag = soup.select_one(
|
||||
"div.fixed-bottom-bar div.contact-lawye a.lr-photo img"
|
||||
)
|
||||
if img_tag:
|
||||
src = (img_tag.get("src") or "").strip()
|
||||
if src:
|
||||
if src.startswith("//"):
|
||||
avatar_url = f"https:{src}"
|
||||
else:
|
||||
avatar_url = src
|
||||
match = re.search(r"/(20\d{2})(\d{2})/", avatar_url)
|
||||
if match:
|
||||
site_time = int(f"{match.group(1)}{match.group(2)}")
|
||||
else:
|
||||
match = re.search(r"(20\d{2})(\d{2})\d{2}", avatar_url)
|
||||
if match:
|
||||
site_time = int(f"{match.group(1)}{match.group(2)}")
|
||||
return avatar_url, site_time
|
||||
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
|
||||
if not self.db or not phones:
|
||||
return set()
|
||||
|
||||
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
resp = self.client.get_text(url, timeout=15, verify=False)
|
||||
status_code = resp.status_code
|
||||
text = resp.text
|
||||
if status_code == 403:
|
||||
if attempt < max_retries - 1:
|
||||
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
|
||||
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
|
||||
self._refresh_session()
|
||||
time.sleep(wait_time)
|
||||
continue
|
||||
print(" 请求失败: 403 Forbidden")
|
||||
return None
|
||||
if status_code >= 400:
|
||||
raise RequestClientError(f"{status_code} Error")
|
||||
return text
|
||||
except RequestClientError as exc:
|
||||
print(f" 请求失败: {exc}")
|
||||
return None
|
||||
return None
|
||||
deduped = sorted({p for p in phones if p})
|
||||
if not deduped:
|
||||
return set()
|
||||
|
||||
def run(self):
|
||||
print("启动华律网采集...")
|
||||
if not self.areas:
|
||||
print("无城市数据")
|
||||
return
|
||||
existing: Set[str] = set()
|
||||
cur = self.db.db.cursor()
|
||||
try:
|
||||
chunk_size = 500
|
||||
for i in range(0, len(deduped), chunk_size):
|
||||
chunk = deduped[i:i + chunk_size]
|
||||
placeholders = ",".join(["%s"] * len(chunk))
|
||||
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
|
||||
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
|
||||
for row in cur.fetchall():
|
||||
existing.add(row[0])
|
||||
finally:
|
||||
cur.close()
|
||||
|
||||
for city_code, city_info in self.areas.items():
|
||||
province_code = city_info.get("province_code")
|
||||
if not province_code:
|
||||
return existing
|
||||
|
||||
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
|
||||
if not self.db:
|
||||
return 0, 0
|
||||
|
||||
rows: List[Dict[str, str]] = []
|
||||
for record in records:
|
||||
row = self._to_legacy_lawyer_row(record)
|
||||
if row:
|
||||
rows.append(row)
|
||||
if not rows:
|
||||
return 0, 0
|
||||
|
||||
existing = self._existing_phones_in_db([row["phone"] for row in rows])
|
||||
inserted = 0
|
||||
skipped = 0
|
||||
|
||||
for row in rows:
|
||||
phone = row.get("phone", "")
|
||||
if not phone or phone in existing:
|
||||
skipped += 1
|
||||
continue
|
||||
province_name = city_info.get("province", "")
|
||||
city_name = city_info.get("name", "")
|
||||
print(f"采集 {province_name}-{city_name}")
|
||||
try:
|
||||
self.db.insert_data("lawyer", row)
|
||||
existing.add(phone)
|
||||
inserted += 1
|
||||
except Exception as exc:
|
||||
skipped += 1
|
||||
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
|
||||
|
||||
page = 1
|
||||
while True:
|
||||
payload = {"pid": province_code, "cid": city_code, "page": str(page)}
|
||||
data = self._post(payload)
|
||||
if not data or not data.get("lawyerList"):
|
||||
break
|
||||
return inserted, skipped
|
||||
|
||||
for item in data["lawyerList"]:
|
||||
result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
|
||||
if not result:
|
||||
def crawl(
|
||||
self,
|
||||
output_path: str,
|
||||
max_cities: int = 0,
|
||||
city_filter: Optional[str] = None,
|
||||
) -> None:
|
||||
cities = self.discover_cities()
|
||||
print(f"[discover] 共发现城市 {len(cities)} 个")
|
||||
|
||||
if city_filter:
|
||||
key = city_filter.strip().lower()
|
||||
cities = [
|
||||
c for c in cities
|
||||
if key in c.city_name.lower() or key in str(c.city_id).lower()
|
||||
]
|
||||
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
|
||||
|
||||
if max_cities > 0:
|
||||
cities = cities[:max_cities]
|
||||
print(f"[discover] 截断城市数 {len(cities)}")
|
||||
|
||||
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
||||
|
||||
seen_ids: Set[str] = set()
|
||||
if os.path.exists(output_path):
|
||||
with open(output_path, "r", encoding="utf-8") as old_file:
|
||||
for line in old_file:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
self.db.insert_data("lawyer", result)
|
||||
print(f" -> 新增: {result['name']} ({result['phone']})")
|
||||
except Exception as exc:
|
||||
print(f" 插入失败: {exc}")
|
||||
time.sleep(1)
|
||||
item = json.loads(line)
|
||||
except Exception:
|
||||
continue
|
||||
rid = item.get("record_id")
|
||||
if rid:
|
||||
seen_ids.add(rid)
|
||||
print(f"[resume] 已有记录 {len(seen_ids)} 条")
|
||||
|
||||
page_count = data.get("lawyerItems", {}).get("pageCount", page)
|
||||
if page >= page_count:
|
||||
break
|
||||
page += 1
|
||||
time.sleep(2)
|
||||
total_new_json = 0
|
||||
total_new_db = 0
|
||||
total_skip_db = 0
|
||||
|
||||
time.sleep(1)
|
||||
print("华律网采集完成")
|
||||
with open(output_path, "a", encoding="utf-8") as out:
|
||||
for idx, target in enumerate(cities, start=1):
|
||||
print(
|
||||
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
|
||||
f"(pid={target.province_id}, cid={target.city_id})"
|
||||
)
|
||||
city_records = list(self.crawl_city(target))
|
||||
|
||||
city_new_json = 0
|
||||
for record in city_records:
|
||||
rid = record["record_id"]
|
||||
if rid in seen_ids:
|
||||
continue
|
||||
out.write(json.dumps(record, ensure_ascii=False) + "\n")
|
||||
seen_ids.add(rid)
|
||||
city_new_json += 1
|
||||
total_new_json += 1
|
||||
|
||||
city_new_db, city_skip_db = self._write_records_to_db(city_records)
|
||||
total_new_db += city_new_db
|
||||
total_skip_db += city_skip_db
|
||||
|
||||
print(
|
||||
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
|
||||
f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
|
||||
)
|
||||
|
||||
print(
|
||||
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
|
||||
f"DB跳过{total_skip_db}条, 输出: {output_path}"
|
||||
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build and parse the CLI options for the Hualv crawler."""
    parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个城市,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个城市最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市名称或城市编码过滤,如 beijing / 110100",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.15,
        help="详情页请求间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    """CLI entry point: build a crawler (with or without a DB) and run it."""
    args = parse_args()

    if args.no_db:
        # JSONL-only mode: no Db handle, records go to the output file only.
        crawler = HualvCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=None,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
        return

    with Db() as db:
        crawler = HualvCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user