feat: enhance project configuration and improve data export functionality

- Updated `.gitignore` to streamline ignored files and cover log files for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to use a session-based approach for HTTP requests, improving error handling and proxy management; a minimal sketch of the shared pattern follows this list.
- Updated `export_lawyers_excel.py` to use a default start timestamp for data exports when no time range is given.
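The session-based refactor mentioned above follows the same shape in every spider touched by this commit. Below is a minimal, self-contained sketch of that shared pattern, assembled from the diffs further down; the proxy dict and User-Agent value are illustrative, not taken from project config.

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_session(proxies=None):
    """Minimal sketch of the shared session setup used by the refactored spiders."""
    s = requests.Session()
    s.trust_env = False                      # ignore system proxy environment variables
    if proxies:                              # e.g. {"https": "http://user:pass@host:port"}
        s.proxies.update(proxies)
    retries = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET", "POST"]),
        raise_on_status=False,               # let the caller inspect resp.status_code itself
    )
    adapter = HTTPAdapter(max_retries=retries)
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({"User-Agent": "Mozilla/5.0", "Connection": "close"})
    return s

# Usage (timeout tuple = connect/read, as in the spiders below):
# session = build_session()
# resp = session.get("https://m.maxlaw.cn/law/beijing?page=1", timeout=(10, 30))
```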
hello-dd-code
2026-03-18 10:02:25 +08:00
parent c2b77975c1
commit 38e7c284e8
14 changed files with 1665 additions and 3004 deletions
+185 -286
@@ -1,14 +1,9 @@
import json
import os
import random
import re
import sys
import time
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -18,144 +13,191 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import (
RequestClientError,
RequestConnectTimeout,
RequestConnectionError,
RequestTimeout,
RequestsClient,
)
from utils.rate_limiter import wait_for_request
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
# 禁用 SSL 警告
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from Db import Db
from utils.rate_limiter import wait_for_request
DOMAIN = "大律师"
SITE_BASE = "https://m.maxlaw.cn"
LIST_TEMPLATE = SITE_BASE + "/law/{pinyin}?page={page}"
PHONE_PATTERN = re.compile(r"1[3-9]\d{9}")
MAX_PAGES_PER_CITY = int(os.getenv("MAXLAW_MAX_PAGES", "0"))
PROXY_TESTED = False
LIST_TEMPLATE = "https://m.maxlaw.cn/law/{pinyin}?page={page}"
_PROXY_TESTED = False
class DlsSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = self._build_client()
self.session = self._build_session()
self.areas = self._load_areas()
def _build_client(self) -> RequestsClient:
client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Host": "m.maxlaw.cn",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "close",
},
retry_total=3,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET", "POST"),
def _build_session(self) -> requests.Session:
"""构建带重试机制的 session"""
report_proxy_status()
s = requests.Session()
s.trust_env = False
proxies = get_proxies()
if proxies:
s.proxies.update(proxies)
else:
s.proxies.clear()
self._proxy_test(s, proxies)
# 配置重试策略
retries = Retry(
total=3, # 总共重试3次
backoff_factor=1, # 重试间隔:1s, 2s, 4s
status_forcelist=(429, 500, 502, 503, 504), # 对这些状态码进行重试
allowed_methods=frozenset(["GET", "POST"]),
raise_on_status=False # 不立即抛出异常,让代码处理
)
self._proxy_test(client, client.proxies or None)
return client
adapter = HTTPAdapter(max_retries=retries)
s.mount("https://", adapter)
s.mount("http://", adapter)
s.headers.update({
"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
"Host": "m.maxlaw.cn",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "close",
})
return s
def _refresh_client(self) -> None:
self.client.refresh()
self._proxy_test(self.client, self.client.proxies or None)
def _refresh_session(self) -> None:
try:
self.session.close()
except Exception:
pass
self.session = self._build_session()
def _proxy_test(self, client: RequestsClient, proxies: Optional[Dict[str, str]]) -> None:
global PROXY_TESTED
if PROXY_TESTED or not os.getenv("PROXY_TEST"):
def _proxy_test(self, session: requests.Session, proxies: Optional[Dict[str, str]]) -> None:
global _PROXY_TESTED
if _PROXY_TESTED or not os.getenv("PROXY_TEST"):
return
PROXY_TESTED = True
_PROXY_TESTED = True
if not proxies:
print("[proxy] test skipped: no proxy configured")
return
test_url = os.getenv("PROXY_TEST_URL", "https://dev.kdlapi.com/testproxy")
timeout = float(os.getenv("PROXY_TEST_TIMEOUT", "10"))
try:
resp = client.get_text(test_url, timeout=timeout, headers={"Connection": "close"})
resp = session.get(
test_url,
timeout=timeout,
headers={"Connection": "close"},
)
print(f"[proxy] test {resp.status_code}: {resp.text.strip()[:200]}")
except Exception as exc:
print(f"[proxy] test failed: {exc}")
def _load_areas(self) -> List[Dict[str, str]]:
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
rows = self.db.select_data(table, "province, city, pinyin", "domain='maxlaw'") or []
except Exception as exc:
last_error = exc
continue
if rows:
missing_pinyin = sum(1 for row in rows if not (row.get("pinyin") or "").strip())
print(f"[大律师] 地区来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
return rows
if last_error:
print(f"[大律师] 加载地区失败: {last_error}")
print("[大律师] 无地区数据(已尝试 area_new/area2/area")
return []
def _load_areas(self):
try:
return self.db.select_data(
"area_new",
"province, city, pinyin",
"domain='maxlaw'"
) or []
except Exception as exc:
print(f"加载地区失败: {exc}")
return []
def _get(
self,
url: str,
*,
headers: Optional[Dict[str, str]] = None,
max_retries: int = 3,
timeout: Tuple[int, int] = (10, 30),
) -> Optional[str]:
def _get(self, url: str, max_retries: int = 3, headers: Optional[Dict[str, str]] = None) -> Optional[str]:
"""发送 GET 请求,带重试机制"""
wait_for_request()
for attempt in range(max_retries):
try:
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
if resp.status_code == 403:
# 使用更长的超时时间,分别设置连接和读取超时
resp = self.session.get(
url,
timeout=(10, 30), # (connect_timeout, read_timeout)
verify=False,
headers=headers,
)
status_code = resp.status_code
content = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = (2 ** attempt) + random.uniform(0.3, 1.0)
print(f"请求403{wait_time:.2f}s后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_client()
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截{wait_time}后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_session()
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if resp.status_code >= 400:
raise RequestClientError(f"{resp.status_code} Error: {url}")
return resp.text
except RequestConnectTimeout as exc:
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
return content
except requests.exceptions.ConnectTimeout as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt # 指数退避:2s, 4s, 8s
print(f"连接超时,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
else:
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
return None
except requests.exceptions.Timeout as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt
print(f"连接超时,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
print(f"请求超时,{wait_time}后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
continue
print(f"连接超时,已达到最大重试次数 {url}: {exc}")
return None
except RequestTimeout as exc:
else:
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
return None
except requests.exceptions.ConnectionError as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt
print(f"请求超时{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
print(f"连接错误{wait_time}后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
continue
print(f"请求超时,已达到最大重试次数 {url}: {exc}")
return None
except RequestConnectionError as exc:
if attempt < max_retries - 1:
wait_time = 2 ** attempt
print(f"连接错误,{wait_time}s后重试 ({attempt + 1}/{max_retries}): {url}")
time.sleep(wait_time)
continue
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
return None
except RequestClientError as exc:
else:
print(f"连接错误,已达到最大重试次数 {url}: {exc}")
return None
except requests.exceptions.RequestException as exc:
print(f"请求失败 {url}: {exc}")
return None
return None
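For reference, the 403 branch above backs off exponentially with a small random jitter before refreshing the session. A quick sketch of the wait schedule it produces, assuming attempt numbers start at 0 as in the loop above:

```python
import random

for attempt in range(3):
    wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
    print(f"attempt {attempt}: sleep ~{wait_time:.2f}s")  # roughly 1-2s, 2-3s, 4-5s
```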
def _parse_list(self, html: str, province: str, city: str, list_url: str) -> int:
soup = BeautifulSoup(html, "html.parser")
cards = soup.find_all("div", class_="lstx")
if not cards:
return 0
inserted = 0
for card in cards:
link = card.find("a")
if not link or not link.get("href"):
continue
detail = self._parse_detail(link['href'], province, city, list_url)
if not detail:
continue
phone = detail.get("phone")
if not phone:
continue
condition = f"phone='{phone}' and domain='{DOMAIN}'"
if self.db.is_data_exist("lawyer", condition):
print(f" -- 已存在: {detail['name']} ({phone})")
time.sleep(0.3)
continue
try:
self.db.insert_data("lawyer", detail)
inserted += 1
print(f" -> 新增: {detail['name']} ({phone})")
except Exception as exc:
print(f" 插入失败: {exc}")
time.sleep(1)
time.sleep(0.3)
# 列表页结束后再缓一缓,降低风控
time.sleep(0.6)
return inserted
def _detail_headers(self, referer: str) -> Dict[str, str]:
return {
"Referer": referer,
@@ -166,215 +208,72 @@ class DlsSpider:
"Upgrade-Insecure-Requests": "1",
}
def _extract_detail_urls(self, html: str) -> List[str]:
soup = BeautifulSoup(html, "html.parser")
urls: List[str] = []
seen: Set[str] = set()
# 主选择器:当前站点列表卡片
for a_tag in soup.select("div.lstx a[href]"):
href = (a_tag.get("href") or "").strip()
if not href:
continue
url = urljoin(SITE_BASE, href)
if url in seen:
continue
seen.add(url)
urls.append(url)
# 回退选择器:页面结构轻微变化时尽量保活
if not urls:
for a_tag in soup.select("a[href]"):
href = (a_tag.get("href") or "").strip()
if "/lawyer/" not in href:
continue
url = urljoin(SITE_BASE, href)
if url in seen:
continue
seen.add(url)
urls.append(url)
return urls
def _extract_name(self, soup: BeautifulSoup) -> str:
for selector in ("h2.lawyerName", "h1.lawyerName", "h1", "h2"):
tag = soup.select_one(selector)
if tag:
name = tag.get_text(strip=True)
if name:
return name
title = soup.title.get_text(strip=True) if soup.title else ""
match = re.search(r"(\S+律师)", title)
return match.group(1) if match else ""
def _extract_law_firm(self, soup: BeautifulSoup) -> str:
for selector in ("p.law-firm", "div.law-firm", "p[class*=firm]"):
tag = soup.select_one(selector)
if tag:
text = tag.get_text(strip=True)
if text:
return text
page_text = soup.get_text(" ", strip=True)
match = re.search(r"(执业机构|律所)\s*[:]?\s*([^\s,。,;]{2,40})", page_text)
if match:
return match.group(2).strip()
return ""
def _normalize_phone(self, text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_PATTERN.search(compact)
return match.group(0) if match else ""
def _extract_phone(self, soup: BeautifulSoup) -> str:
contact = soup.select_one("ul.contact-content")
if contact:
phone = self._normalize_phone(contact.get_text(" ", strip=True))
if phone:
return phone
for selector in ("a[href^='tel:']", "span.phone", "p.phone"):
tag = soup.select_one(selector)
if tag:
phone = self._normalize_phone(tag.get_text(" ", strip=True))
if phone:
return phone
return self._normalize_phone(soup.get_text(" ", strip=True))
def _parse_detail(self, detail_url: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
print(f" 详情: {detail_url}")
html = self._get(detail_url, headers=self._detail_headers(list_url))
def _parse_detail(self, path: str, province: str, city: str, list_url: str) -> Optional[Dict[str, str]]:
url = f"https://m.maxlaw.cn{path}"
print(f" 详情: {url}")
html = self._get(url, headers=self._detail_headers(list_url))
if not html:
return None
soup = BeautifulSoup(html, "html.parser")
name = self._extract_name(soup)
phone = self._extract_phone(soup)
name_tag = soup.find("h2", class_="lawyerName")
law_firm_tag = soup.find("p", class_="law-firm")
contact_list = soup.find("ul", class_="contact-content")
name = name_tag.get_text(strip=True) if name_tag else ""
law_firm = law_firm_tag.get_text(strip=True) if law_firm_tag else ""
phone = ""
if contact_list:
items = contact_list.find_all("li")
if len(items) > 2:
phone_tag = items[2].find("p")
if phone_tag:
phone = phone_tag.get_text(strip=True)
phone = phone.split("咨询请说明来自大律师网")[0].strip()
phone = phone.replace('-', '').strip()
if not name or not phone:
print(" 信息不完整,跳过")
return None
safe_city = city or province
safe_city = city if city else province
return {
"name": name,
"law_firm": self._extract_law_firm(soup),
"law_firm": law_firm,
"province": province,
"city": safe_city,
"phone": phone,
"url": detail_url,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False),
"params": json.dumps({"province": province, "city": safe_city}, ensure_ascii=False)
}
def _existing_phones(self, phones: List[str]) -> Set[str]:
if not phones:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for idx in range(0, len(phones), chunk_size):
chunk = phones[idx:idx + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _save_lawyers(self, lawyers: List[Dict[str, str]]) -> Tuple[int, int]:
if not lawyers:
return 0, 0
phones = [row["phone"] for row in lawyers if row.get("phone")]
existing = self._existing_phones(phones)
inserted = 0
skipped = 0
for row in lawyers:
phone = row.get("phone", "")
if not phone:
skipped += 1
continue
if phone in existing:
skipped += 1
print(f" -- 已存在: {row.get('name', '')} ({phone})")
continue
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
print(f" -> 新增: {row.get('name', '')} ({phone})")
except Exception as exc:
skipped += 1
print(f" 插入失败 {row.get('url', '')}: {exc}")
return inserted, skipped
def _crawl_city(self, area: Dict[str, str]) -> Tuple[int, int]:
pinyin = (area.get("pinyin") or "").strip()
province = area.get("province", "")
city = area.get("city", "")
if not pinyin:
return 0, 0
total_inserted = 0
total_parsed = 0
page = 1
prev_fingerprint = ""
while True:
if MAX_PAGES_PER_CITY > 0 and page > MAX_PAGES_PER_CITY:
print(f"达到分页上限({MAX_PAGES_PER_CITY}),停止 {province}-{city}")
break
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
print(f"采集 {province}-{city}{page} 页: {list_url}")
html = self._get(list_url)
if not html:
break
detail_urls = self._extract_detail_urls(html)
if not detail_urls:
print(" 列表为空,结束当前城市")
break
fingerprint = "|".join(detail_urls[:8])
if fingerprint and fingerprint == prev_fingerprint:
print(" 列表页重复,提前停止当前城市")
break
prev_fingerprint = fingerprint
lawyers: List[Dict[str, str]] = []
for detail_url in detail_urls:
row = self._parse_detail(detail_url, province, city, list_url)
if row:
lawyers.append(row)
time.sleep(0.25)
inserted, skipped = self._save_lawyers(lawyers)
total_inserted += inserted
total_parsed += len(lawyers)
print(
f"{page} 页完成: 列表{len(detail_urls)}条, "
f"解析{len(lawyers)}条, 新增{inserted}条, 跳过{skipped}"
)
page += 1
time.sleep(0.5)
return total_inserted, total_parsed
def run(self):
print("启动大律师采集...")
if not self.areas:
print("无地区数据")
return
all_inserted = 0
all_parsed = 0
for area in self.areas:
inserted, parsed = self._crawl_city(area)
all_inserted += inserted
all_parsed += parsed
print(f"大律师采集完成: 解析{all_parsed}条, 新增{all_inserted}")
pinyin = area.get("pinyin")
province = area.get("province", "")
city = area.get("city", "")
if not pinyin:
continue
page = 1
while True:
list_url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
print(f"采集 {province}-{city}{page} 页: {list_url}")
html = self._get(list_url)
if not html:
break
inserted = self._parse_list(html, province, city, list_url)
if inserted == 0:
break
page += 1
print("大律师采集完成")
if __name__ == "__main__":
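The remainder of this file is collapsed in the diff. Based on the sibling spiders later in this commit, the entry point presumably follows the same shape; a sketch, assuming `Db` supports the context-manager protocol as it does elsewhere in the diff (the guard line is repeated so the snippet stands on its own):

```python
if __name__ == "__main__":        # guard repeated from the line above for completeness
    with Db() as db:              # Db is imported at the top of this file
        spider = DlsSpider(db)
        spider.run()
```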
+11 -6
@@ -19,6 +19,9 @@ if project_root not in sys.path:
from Db import Db
DEFAULT_EXPORT_START_TS = 1772932103
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="导出律师数据到 Excel")
parser.add_argument(
@@ -30,7 +33,10 @@ def parse_args() -> argparse.Namespace:
"--start-ts",
type=int,
default=None,
help="create_time 起始时间戳(含),不传时默认取最近7天",
help=(
"create_time 起始时间戳(含),"
f"不传时默认取 {DEFAULT_EXPORT_START_TS} 之后的数据"
),
)
parser.add_argument(
"--end-ts",
@@ -83,9 +89,9 @@ def parse_args() -> argparse.Namespace:
def apply_default_time_filter(args: argparse.Namespace) -> None:
# 未显式传时间范围时,默认导出最近7天的数据
# 未显式传时间范围时,默认导出指定时间戳之后的数据
if args.start_ts is None and args.end_ts is None:
args.start_ts = int(time.time()) - 7 * 24 * 3600
args.start_ts = DEFAULT_EXPORT_START_TS
args.end_ts = 0
return
if args.start_ts is None:
@@ -211,11 +217,10 @@ def export_to_excel(
ws = wb.active
ws.title = "lawyers"
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain"]
headers = ["手机号", "姓名", "律所", "省份", "市区", "站点名称", "domain", "URL"]
if include_extra:
headers.extend(
[
"URL",
"站点",
"create_time",
"create_time_text",
@@ -270,12 +275,12 @@ def export_to_excel(
row.get("city", "") or "",
site_name,
row.get("domain", "") or "",
row.get("url", "") or "",
]
if include_extra:
line.extend(
[
row.get("url", "") or "",
row.get("domain", "") or "",
row.get("create_time", "") or "",
ts_to_text(row.get("create_time")),
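To make the new default concrete, this is roughly how `apply_default_time_filter` behaves after this change. A sketch using the constant introduced above; `argparse.Namespace` stands in for the parsed CLI arguments, and the meaning of `end_ts = 0` is an assumption about downstream handling.

```python
import argparse

DEFAULT_EXPORT_START_TS = 1772932103  # constant introduced in this commit

def apply_default_time_filter(args: argparse.Namespace) -> None:
    # No explicit time range given: fall back to the fixed default start timestamp.
    if args.start_ts is None and args.end_ts is None:
        args.start_ts = DEFAULT_EXPORT_START_TS
        args.end_ts = 0   # presumably interpreted downstream as "no upper bound"

args = argparse.Namespace(start_ts=None, end_ts=None)
apply_default_time_filter(args)
print(args.start_ts, args.end_ts)   # -> 1772932103 0
```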
+174 -429
@@ -1,16 +1,9 @@
import argparse
import ast
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
import urllib3
import random
from typing import Dict, List, Set, Optional
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -20,460 +13,212 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
import requests
from request.proxy_config import get_proxies, report_proxy_status
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "findlaw"
LEGACY_DOMAIN = "找法网"
SITE_BASE = "https://m.findlaw.cn"
CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
PHONE_RE = re.compile(r"1[3-9]\d{9}")
DOMAIN = "找法网"
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
@dataclass
class CityTarget:
province_id: str
province_name: str
province_py: str
city_id: str
city_name: str
city_py: str
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
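For a self-contained check of the helper above (the function body is copied verbatim so the snippet runs on its own): it strips everything that is not a digit, then matches a mainland mobile number.

```python
import re

PHONE_RE = re.compile(r"1[3-9]\d{9}")

def normalize_phone(text: str) -> str:
    compact = re.sub(r"\D", "", text or "")   # keep digits only
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""

print(normalize_phone("138-0013-8000 咨询请说明来自大律师网"))  # -> 13800138000
print(normalize_phone("010-12345678"))                          # -> "" (landline, no match)
```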
class FindlawCrawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class FindlawSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET",),
)
self.session = self._build_session()
self.cities = self._load_cities()
def _get_text(
self,
url: str,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
session.headers.update({
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Connection": "close",
})
return session
for attempt in range(max_retries):
wait_for_request()
try:
resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
match = re.search(pattern, script_text)
if not match:
return []
raw = match.group(1)
def _refresh_session(self) -> None:
try:
rows = ast.literal_eval(raw)
return rows if isinstance(rows, list) else []
self.session.close()
except Exception:
return []
pass
self.session = self._build_session()
def discover_cities(self) -> List[CityTarget]:
js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
provinces = self._parse_city_js_array(js_text, "iosProvinces")
cities = self._parse_city_js_array(js_text, "iosCitys")
province_map: Dict[str, Dict] = {}
for item in provinces:
pid = str(item.get("id") or "").strip()
if pid:
province_map[pid] = item
results: List[CityTarget] = []
seen_py: Set[str] = set()
for city in cities:
city_py = str(city.get("pinyin") or "").strip()
city_name = str(city.get("value") or "").strip()
city_id = str(city.get("id") or "").strip()
province_id = str(city.get("parentId") or "").strip()
if not city_py or not city_name or not city_id:
continue
if city_py in seen_py:
continue
seen_py.add(city_py)
province_row = province_map.get(province_id, {})
province_name = str(province_row.get("value") or city_name).strip()
province_py = str(province_row.get("pinyin") or city_py).strip()
results.append(
CityTarget(
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_id=city_id,
city_name=city_name,
city_py=city_py,
)
)
return results
def _parse_list_payload(self, text: str) -> Dict:
cleaned = (text or "").strip().lstrip("\ufeff")
try:
return json.loads(cleaned)
except ValueError:
start = cleaned.find("{")
end = cleaned.rfind("}")
if start == -1 or end == -1:
return {}
return json.loads(cleaned[start:end + 1])
def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
text = self._get_text(list_url, referer=referer)
payload = self._parse_list_payload(text)
if payload.get("errcode") != 0:
return [], False, list_url
data = payload.get("data", {}) or {}
items = data.get("lawyer_list", []) or []
has_more = str(data.get("has_more", "0")) == "1"
return items, has_more, list_url
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
for page in range(1, self.max_pages + 1):
def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
headers = {"Referer": referer}
for attempt in range(max_retries):
try:
items, has_more, list_url = self.fetch_list_page(target.city_py, page)
except Exception as exc:
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
break
resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries}): {url}")
self._refresh_session()
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
return text
except requests.exceptions.SSLError:
if verify:
return self._get(url, referer, verify=False, max_retries=max_retries)
print(f"SSL错误 {url}")
return None
except requests.exceptions.RequestException as exc:
print(f"请求失败 {url}: {exc}")
return None
return None
if not items:
break
for item in items:
detail_url = item.get("siteask_m") or item.get("site_url") or ""
detail_url = str(detail_url).strip()
if not detail_url.startswith("http"):
detail_url = list_url
phone = normalize_phone(item.get("mobile", ""))
profile = {
"uid": str(item.get("uid") or ""),
"name": str(item.get("username") or "").strip(),
"law_firm": str(item.get("lawyer_lawroom") or "").strip(),
"phone": phone,
"lawyer_year": item.get("lawyer_year"),
"service_area": str(item.get("service_area") or "").strip(),
"address": str(item.get("addr") or "").strip(),
"specialties": item.get("professionArr") or [],
"answer_count": item.get("ansnum"),
"comment_count": item.get("askcommentnum"),
}
now = int(time.time())
uid = profile.get("uid", "")
record_key = uid or detail_url
record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()
area = item.get("areaInfo", {}) or {}
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"list_url": list_url,
"detail_url": detail_url,
"province": str(area.get("province") or target.province_name),
"province_py": target.province_py,
"city": str(area.get("city") or target.city_name),
"city_py": target.city_py,
"page": page,
},
"list_snapshot": {
"uid": uid,
"name": profile["name"],
"law_firm": profile["law_firm"],
"answer_count": profile["answer_count"],
"comment_count": profile["comment_count"],
},
"profile": profile,
"raw": item,
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
if not has_more:
break
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
"province": province,
"city": city,
"phone": phone,
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
}
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
def _existing_phones(self, phones: List[str]) -> Set[str]:
if not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
for i in range(0, len(phones), chunk_size):
chunk = phones[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
cur.execute(sql, [DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
def _load_cities(self):
condition = "domain='findlaw' AND level=2"
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
rows = self.db.select_data(table, "city, province, pinyin", condition) or []
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
last_error = exc
continue
if rows:
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
print(f"[找法网] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
return rows
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现城市 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断城市数 {len(cities)}")
if last_error:
print(f"[找法网] 加载地区数据失败: {last_error}")
print("[找法网] 无城市数据(已尝试 area_new/area2/area")
for table in tables:
try:
cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
print(f"[找法网] 校验: {table} 满足条件记录数: {c}")
except Exception:
pass
return []
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
def _fetch_page(self, url: str, referer: str) -> List[Dict]:
text = self._get(url, referer, verify=True)
if not text:
return []
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
try:
# 某些返回体前会携带 BOM 或包装脚本,此处做兼容
text = text.strip().lstrip("\ufeff")
try:
data = json.loads(text)
except ValueError:
json_start = text.find('{')
json_end = text.rfind('}')
if json_start == -1 or json_end == -1:
print(f"解析JSON失败 {url}: 返回内容开头: {text[:80]!r}")
return []
cleaned = text[json_start:json_end + 1]
data = json.loads(cleaned)
if isinstance(data, str):
try:
data = json.loads(data)
except ValueError:
print(f"解析JSON失败 {url}: 二次解析仍为字符串,开头: {str(data)[:80]!r}")
return []
except ValueError as exc:
print(f"解析JSON失败 {url}: {exc}")
return []
items = data.get("data", {}).get("lawyer_list", [])
parsed = []
for item in items:
phone = (item.get("mobile") or "").replace("-", "")
parsed.append({
"name": item.get("username", ""),
"law_firm": item.get("lawyer_lawroom", ""),
"province": item.get("areaInfo", {}).get("province", ""),
"city": item.get("areaInfo", {}).get("city", ""),
"phone": phone,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps(item, ensure_ascii=False)
})
return parsed
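The payload handling above tolerates a UTF-8 BOM, junk wrapped around the JSON object, and double-encoded responses. A condensed, self-contained sketch of that fallback chain:

```python
import json
from typing import Optional

def parse_payload(text: str) -> Optional[dict]:
    """Best-effort JSON extraction mirroring the fallbacks in _fetch_page above."""
    cleaned = (text or "").strip().lstrip("\ufeff")    # drop a leading BOM if present
    try:
        data = json.loads(cleaned)
    except ValueError:
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end == -1:
            return None
        data = json.loads(cleaned[start:end + 1])      # slice out the outermost object
    if isinstance(data, str):                          # response was double-encoded JSON
        data = json.loads(data)
    return data if isinstance(data, dict) else None

print(parse_payload('\ufeff{"errcode": 0, "data": {"lawyer_list": []}}'))
```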
def run(self):
print("启动找法网采集...")
if not self.cities:
print("无城市数据")
return
for city in self.cities:
pinyin = city.get("pinyin")
province = city.get("province", "")
city_name = city.get("city", "")
if not pinyin:
continue
print(f"采集 {province}-{city_name}")
page = 1
while True:
url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
print(f"{page} 页: {url}")
items = self._fetch_page(url, referer)
if not items:
break
phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
existing = self._existing_phones(phones)
for entry in items:
phone = entry.get("phone")
if not phone:
continue
if phone in existing:
print(f" -- 已存在: {entry['name']} ({phone})")
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
self.db.insert_data("lawyer", entry)
print(f" -> 新增: {entry['name']} ({phone})")
except Exception as exc:
print(f" 插入失败: {exc}")
total_new_json = 0
total_new_db = 0
total_skip_db = 0
page += 1
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"({target.city_py})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个城市,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个城市最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市拼音或城市名过滤,如 beijing",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="每条记录采集间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = FindlawCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = FindlawCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
print("找法网采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = FindlawSpider(db)
spider.run()
+288 -788
File diff suppressed because it is too large.
+236 -586
@@ -1,16 +1,13 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Set, Tuple
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -20,628 +17,281 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "lawtime"
LEGACY_DOMAIN = "法律快车"
SITE_BASE = "https://www.lawtime.cn"
PROVINCE_API = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode=0"
CITY_API_TEMPLATE = "https://www.lawtime.cn/public/stationIndex/getAreaList?areacode={province_id}"
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/lawyer"
from Db import Db
from config import LAWTIME_CONFIG
PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
DETAIL_BASE = "https://m.lawtime.cn"
DOMAIN = "法律快车"
@dataclass
class CityTarget:
province_id: str
province_name: str
province_py: str
city_id: str
city_name: str
city_py: str
@dataclass
class ListCard:
detail_url: str
name: str
phone: str
address: str = ""
specialties: List[str] = field(default_factory=list)
metric_text: str = ""
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
class LawtimeCrawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class LawtimeSpider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/json,*/*;q=0.8",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET",),
)
self.session = self._build_session()
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
self._tls = threading.local()
def _get_text(
self,
url: str,
*,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
headers = LAWTIME_CONFIG.get("HEADERS", {})
if headers:
session.headers.update(headers)
session.headers.setdefault("Connection", "close")
return session
for attempt in range(max_retries):
wait_for_request()
try:
resp = self.client.get_text(
url,
timeout=timeout,
verify=False,
headers=headers,
)
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
def _get_json(self, url: str, *, referer: str) -> List[Dict]:
text = self._get_text(url, referer=referer)
cleaned = (text or "").strip().lstrip("\ufeff")
if not cleaned or cleaned.startswith("<"):
return []
def _refresh_session(self) -> None:
try:
data = json.loads(cleaned)
except ValueError:
return []
return data if isinstance(data, list) else []
self.session.close()
except Exception:
pass
self.session = self._build_session()
def discover_cities(self) -> List[CityTarget]:
provinces = self._get_json(PROVINCE_API, referer=SITE_BASE)
if not provinces:
print("[discover] 地区接口未返回有效数据")
return []
def _get_thread_session(self) -> requests.Session:
s = getattr(self._tls, "session", None)
if s is not None:
return s
s = self._build_session()
s.headers.update(dict(self.session.headers))
self._tls.session = s
return s
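Because `requests.Session` is not strictly thread-safe, the crawler gives each worker thread its own session via `threading.local()`, as shown above. A minimal standalone sketch of the same idea; the spider's `_build_session` is replaced here by a plain `requests.Session()` and the URL is illustrative.

```python
import threading
import requests
from concurrent.futures import ThreadPoolExecutor

_tls = threading.local()

def get_thread_session() -> requests.Session:
    # One Session per worker thread; sessions are not shared across threads.
    s = getattr(_tls, "session", None)
    if s is None:
        s = requests.Session()
        _tls.session = s
    return s

def fetch(url: str) -> int:
    return get_thread_session().get(url, timeout=15).status_code

with ThreadPoolExecutor(max_workers=8) as ex:
    print(list(ex.map(fetch, ["https://www.example.com"] * 3)))
```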
results: List[CityTarget] = []
seen_py: Set[str] = set()
for province in provinces:
province_id = str(province.get("id") or "").strip()
province_name = str(province.get("province") or province.get("city") or "").strip()
province_py = str(province.get("pinyin") or "").strip()
if not province_id or not province_name:
continue
city_api = CITY_API_TEMPLATE.format(province_id=province_id)
def _refresh_thread_session(self) -> None:
s = getattr(self._tls, "session", None)
if s is not None:
try:
cities = self._get_json(city_api, referer=LIST_URL_TEMPLATE.format(city_py=province_py or ""))
except Exception as exc:
print(f"[city] 获取失败 province={province_id}: {exc}")
continue
if not cities:
cities = [
{
"id": province_id,
"province": province_name,
"city": province_name,
"pinyin": province_py,
}
]
for city in cities:
city_id = str(city.get("id") or "").strip()
city_name = str(city.get("city") or city.get("province") or "").strip()
city_py = str(city.get("pinyin") or "").strip()
if not city_id or not city_name or not city_py:
continue
if city_py in seen_py:
continue
seen_py.add(city_py)
results.append(
CityTarget(
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_id=city_id,
city_name=city_name,
city_py=city_py,
)
)
return results
def _build_list_url(self, city_py: str, page: int) -> str:
base = LIST_URL_TEMPLATE.format(city_py=city_py)
if page <= 1:
return base
return f"{base}?page={page}"
def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[ListCard], bool, str]:
list_url = self._build_list_url(target.city_py, page)
html = self._get_text(list_url, referer=SITE_BASE + "/")
cards = self.parse_list_cards(html)
soup = BeautifulSoup(html, "html.parser")
next_link = soup.select_one(f"div.page a[href*='page={page + 1}']")
has_next = next_link is not None
return cards, has_next, list_url
def parse_list_cards(self, html: str) -> List[ListCard]:
soup = BeautifulSoup(html, "html.parser")
cards: List[ListCard] = []
seen: Set[str] = set()
for item in soup.select("li.lawyer-item-card"):
link_tag = item.select_one("a.name[href]") or item.select_one("a.lawyer-img-box[href]")
if not link_tag:
continue
detail_url = (link_tag.get("href") or "").strip()
if not detail_url.startswith("http"):
continue
if detail_url in seen:
continue
seen.add(detail_url)
name = link_tag.get_text(strip=True)
phone = ""
phone_tag = item.select_one("div.phone")
if phone_tag:
phone = normalize_phone(phone_tag.get_text(" ", strip=True))
address = ""
addr_tag = item.select_one("div.location .txt")
if addr_tag:
address = addr_tag.get_text(" ", strip=True)
specialties: List[str] = []
prof_tag = item.select_one("div.prof .txt")
if prof_tag:
specialties = [
x.strip() for x in re.split(r"[、,]", prof_tag.get_text(" ", strip=True)) if x.strip()
]
metric_text = ""
metric_tag = item.select_one("div.num-msg")
if metric_tag:
metric_text = metric_tag.get_text(" ", strip=True)
cards.append(
ListCard(
detail_url=detail_url,
name=name,
phone=phone,
address=address,
specialties=specialties,
metric_text=metric_text,
)
)
return cards
def parse_detail(self, detail_url: str) -> Dict:
html = self._get_text(detail_url, referer=SITE_BASE)
if "网站防火墙" in html or "访问被拒绝" in html or "/ipfilter/verify" in html:
raise RequestClientError(f"firewall blocked: {detail_url}")
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text(" ", strip=True)
name = ""
law_firm = ""
phone = ""
address = ""
practice_years: Optional[int] = None
specialties: List[str] = []
if soup.title:
title = soup.title.get_text(" ", strip=True)
match = re.search(r"([^\s_,。]+?)律师", title)
if match:
name = match.group(1).strip()
phone_candidates = [
soup.select_one(".data-w .tel-b b").get_text(" ", strip=True)
if soup.select_one(".data-w .tel-b b")
else "",
soup.select_one(".law-info-b .item .two-r.b").get_text(" ", strip=True)
if soup.select_one(".law-info-b .item .two-r.b")
else "",
text,
]
for candidate in phone_candidates:
phone = normalize_phone(candidate)
if phone:
break
law_firm_tag = soup.select_one(".law-info-b .item .two-nowrap")
if law_firm_tag:
law_firm = law_firm_tag.get_text(" ", strip=True)
for li in soup.select(".law-info-b .item"):
li_text = li.get_text(" ", strip=True)
if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
law_firm = li_text
addr_tag = soup.select_one(".law-info-b .item .two-r[title]")
if addr_tag:
addr_value = (addr_tag.get("title") or "").strip()
if len(addr_value) > 8:
address = addr_value
if not address:
addr_tag = soup.select_one(".law-info-b .item .two-r")
if addr_tag:
addr_value = addr_tag.get_text(" ", strip=True)
if len(addr_value) > 8 and "律师" not in addr_value:
address = addr_value
year_match = YEAR_RE.search(text)
if year_match:
try:
practice_years = int(year_match.group(1))
s.close()
except Exception:
practice_years = None
pass
self._tls.session = None
specialties = [x.get_text(strip=True) for x in soup.select(".profession-b .item") if x.get_text(strip=True)]
return {
"name": name,
"law_firm": law_firm,
"phone": phone,
"address": address,
"practice_years": practice_years,
"specialties": specialties,
"detail_url": detail_url,
}
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
seen_details: Set[str] = set()
for page in range(1, self.max_pages + 1):
try:
cards, has_next, list_url = self.fetch_list_page(target, page)
except Exception as exc:
print(f"[list] 失败 {target.city_py} p{page}: {exc}")
break
if not cards:
break
for card in cards:
if card.detail_url in seen_details:
continue
seen_details.add(card.detail_url)
detail: Dict = {}
try:
detail = self.parse_detail(card.detail_url)
except Exception as exc:
print(f"[detail] 失败 {card.detail_url}: {exc}")
phone = normalize_phone(detail.get("phone") or card.phone)
profile_name = (detail.get("name") or card.name).replace("律师", "").strip()
now = int(time.time())
record_id = hashlib.md5(card.detail_url.encode("utf-8")).hexdigest()
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"province_id": target.province_id,
"province": target.province_name,
"province_py": target.province_py,
"city_id": target.city_id,
"city": target.city_name,
"city_py": target.city_py,
"page": page,
"list_url": list_url,
"detail_url": card.detail_url,
},
"list_snapshot": {
"name": card.name,
"phone": card.phone,
"address": card.address,
"specialties": card.specialties,
"metric_text": card.metric_text,
},
"profile": {
"name": profile_name,
"law_firm": (detail.get("law_firm") or "").strip(),
"phone": phone,
"address": (detail.get("address") or card.address or "").strip(),
"practice_years": detail.get("practice_years"),
"specialties": detail.get("specialties") or card.specialties,
},
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
if not has_next:
break
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
"province": province,
"city": city,
"phone": phone,
"url": (source.get("detail_url") or source.get("list_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
}
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
def _existing_phones(self, phones: List[str]) -> Set[str]:
if not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
for i in range(0, len(phones), chunk_size):
chunk = phones[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
cur.execute(sql, [DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
def _load_areas(self):
condition = "level = 2 and domain='法律快车'"
tables = ("area_new", "area", "area2")
last_error = None
for table in tables:
try:
rows = self.db.select_data(table, "pinyin, province, city", condition) or []
except Exception as exc:
last_error = exc
continue
if rows:
missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
return rows
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
if last_error:
print(f"[法律快车] 加载地区数据失败: {last_error}")
print("[法律快车] 无城市数据(已尝试 area_new/area/area2")
return []
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
return self._get_with_session(self.session, url, max_retries=max_retries, is_thread=False)
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
def _get_with_session(self, session: requests.Session, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
for attempt in range(max_retries):
try:
resp = session.get(url, timeout=15, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"请求失败 {url}: 403{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
if is_thread:
self._refresh_thread_session()
session = self._get_thread_session()
else:
self._refresh_session()
session = self.session
time.sleep(wait_time)
continue
print(f"请求失败 {url}: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
return text
except requests.exceptions.RequestException as exc:
print(f"请求失败 {url}: {exc}")
return None
return None
def _parse_list(self, html: str, province: str, city: str) -> int:
soup = BeautifulSoup(html, "html.parser")
links = [a.get("href", "") for a in soup.select("a.hide_link")]
links = [link.replace("lll", "int") for link in links if link]
if not links:
return 0
detail_urls = [urljoin(DETAIL_BASE, link) for link in links]
results: List[Dict[str, str]] = []
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
for fut in as_completed(futs):
try:
data = fut.result()
except Exception as exc:
print(f" 详情解析异常: {exc}")
continue
if data and data.get("phone"):
results.append(data)
if not results:
return len(detail_urls)
phones = [d["phone"] for d in results if d.get("phone")]
existing = self._existing_phones(phones)
for data in results:
phone = data.get("phone")
if not phone:
continue
if phone in existing:
print(f" -- 已存在: {data['name']} ({phone})")
continue
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
self.db.insert_data("lawyer", data)
print(f" -> 新增: {data['name']} ({phone})")
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
print(f" 插入失败 {data.get('url')}: {exc}")
return inserted, skipped
return len(detail_urls)
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现城市 {len(cities)}")
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
html = None
sess = self._get_thread_session()
html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
if not html:
return None
if city_filter:
key = city_filter.strip().lower()
cities = [
c for c in cities
if key in c.city_py.lower() or key in c.city_name.lower()
]
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
soup = BeautifulSoup(html, "html.parser")
text = soup.get_text(" ")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断城市数 {len(cities)}")
name = ""
title_tag = soup.find("title")
if title_tag:
match = re.search(r"(\S+)律师", title_tag.get_text())
if match:
name = match.group(1)
if not name:
intl_div = soup.find("div", class_="intl")
if intl_div:
match = re.search(r"(\S+)律师", intl_div.get_text())
if match:
name = match.group(1)
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
phone = ""
phone_pattern = r"1[3-9]\d{9}"
for item in soup.select("div.item.flex"):
label = item.find("div", class_="label")
desc = item.find("div", class_="desc")
if not label or not desc:
continue
label_text = label.get_text()
desc_text = desc.get_text().replace("-", "")
if "联系电话" in label_text or "电话" in label_text:
matches = re.findall(phone_pattern, desc_text)
if matches:
phone = matches[0]
break
if not phone:
matches = re.findall(phone_pattern, text.replace("-", ""))
if matches:
phone = matches[0]
if not phone:
print(f" 无手机号: {url}")
return None
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
law_firm = ""
for item in soup.select("div.item.flex"):
label = item.find("div", class_="label")
desc = item.find("div", class_="desc")
if not label or not desc:
continue
if "执业律所" in label.get_text() or "律所" in label.get_text():
law_firm = desc.get_text(strip=True).replace("已认证", "")
break
total_new_json = 0
total_new_db = 0
total_skip_db = 0
params = {
"list_url": url,
"province": province,
"city": city,
}
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"({target.city_py})"
)
city_records = list(self.crawl_city(target))
return {
"name": name or "",
"law_firm": law_firm,
"province": province,
"city": city,
"phone": phone,
"url": url,
"domain": DOMAIN,
"create_time": int(time.time()),
"params": json.dumps(params, ensure_ascii=False)
}
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
def run(self):
print("启动法律快车采集...")
areas = self._load_areas()
if not areas:
print("无地区数据")
return
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="法律快车全新采集脚本(站点数据直采)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/lawtime_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个城市,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个城市最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市拼音或城市名过滤,如 beijing",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="详情页请求间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = LawtimeCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = LawtimeCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
for area in areas:
pinyin = area.get("pinyin")
province = area.get("province", "")
city = area.get("city", "")
if not pinyin:
continue
page = 1
while True:
list_url = LIST_BASE.format(pinyin=pinyin, page=page)
print(f"采集 {province}-{city}{page} 页: {list_url}")
html = self._get(list_url)
if not html:
break
link_count = self._parse_list(html, province, city)
if link_count == 0:
break
page += 1
print("法律快车采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = LawtimeSpider(db)
spider.run()
+264 -608
@@ -1,17 +1,11 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -21,237 +15,165 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "64365"
LEGACY_DOMAIN = "律图"
SITE_BASE = "https://m.64365.com"
AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
from Db import Db
PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"(\d+)\s*年")
DOMAIN = "律图"
LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
@dataclass
class CityTarget:
area_id: str
province_id: str
province_name: str
province_py: str
city_name: str
city_py: str
@dataclass
class ListCard:
detail_url: str
name: str
specialties: List[str]
score_text: str
service_text: str
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
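# normalize_phone strips every non-digit before matching, so separators and
# labels around the number do not matter; for illustration:
#
#     normalize_phone("电话:138-0013-8000(微信同号)")  ->  "13800138000"
#     normalize_phone("暂无联系方式")                   ->  ""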
class Six4365Crawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class Six4365Spider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "text/html, */*; q=0.01",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET", "POST"),
)
self.session = self._build_session()
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
self._tls = threading.local()
self.cities = self._load_cities()
def _request_text(
self,
method: str,
url: str,
*,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
data: Optional[Dict] = None,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
session.headers.update({
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Connection": "close",
})
return session
for attempt in range(max_retries):
wait_for_request()
def _refresh_session(self) -> None:
try:
self.session.close()
except Exception:
pass
self.session = self._build_session()
def _get_thread_session(self) -> requests.Session:
"""requests.Session 不是严格线程安全:每个线程用独立 session(但共享同样代理/headers"""
s = getattr(self._tls, "session", None)
if s is not None:
return s
s = self._build_session()
s.headers.update(dict(self.session.headers))
self._tls.session = s
return s
def _refresh_thread_session(self) -> None:
s = getattr(self._tls, "session", None)
if s is not None:
try:
if method.upper() == "POST":
resp = self.client.post_text(
url,
timeout=timeout,
verify=False,
headers=headers,
data=data,
)
else:
resp = self.client.get_text(
url,
timeout=timeout,
verify=False,
headers=headers,
)
s.close()
except Exception:
pass
self._tls.session = None
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
def _existing_urls(self, urls: List[str]) -> Set[str]:
"""批量查重,减少 N 次 is_data_exist"""
if not urls:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
# IN 参数过多会失败,分批
chunk_size = 500
for i in range(0, len(urls), chunk_size):
chunk = urls[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
cur.execute(sql, chunk)
for row in cur.fetchall():
# pymysql 默认返回 tuple
existing.add(row[0])
finally:
cur.close()
return existing
def _load_cities(self):
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
provinces = self.db.select_data(
table,
"id, code, province",
"domain='64365' AND level=1"
) or []
cities = self.db.select_data(
table,
"code, city, province, pid",
"domain='64365' AND level=2"
) or []
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
continue
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
if not cities:
continue
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
return self._request_text(
"GET",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
)
province_map = {row.get('id'): row for row in provinces}
data = {}
for city in cities:
province_row = province_map.get(city.get('pid'), {}) or {}
data[str(city.get('code'))] = {
"name": city.get('city'),
"province": city.get('province'),
"province_name": province_row.get('province', city.get('province')),
}
print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
return data
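# _load_cities joins level-2 city rows to their level-1 province rows purely in
# Python via an {id: row} map, so one query per table is enough. The same join
# in isolation (the rows here are invented for illustration):
provinces = [{"id": 1, "province": "广东省"}]
cities = [{"code": "4401", "city": "广州市", "province": "广东", "pid": 1}]
province_map = {row["id"]: row for row in provinces}
lookup = {
    str(c["code"]): {
        "name": c["city"],
        "province": c["province"],
        "province_name": province_map.get(c["pid"], {}).get("province", c["province"]),
    }
    for c in cities
}
print(lookup)  # {'4401': {'name': '广州市', 'province': '广东', 'province_name': '广东省'}}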
def _post_text(
self,
url: str,
*,
data: Dict,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
return self._request_text(
"POST",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
data=data,
)
if last_error:
print(f"[律图] 加载地区数据失败: {last_error}")
print("[律图] 无城市数据(已尝试 area_new/area2/area")
return {}
def _extract_area_data(self, text: str) -> List[Dict]:
match = re.search(
r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
text,
re.S,
)
if not match:
return []
raw = match.group(1)
try:
data = json.loads(raw)
except Exception:
return []
return data if isinstance(data, list) else []
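# _extract_area_data lifts a JSON array straight out of the site's JS bundle.
# A minimal sketch of the same regex-plus-json.loads idea on an invented
# one-line payload:
import json
import re

js_text = 'lvtuData.areaData = [{"id": "110000", "name": "北京", "py": "beijing", "child": []}] ; lvtuData.categroyData = [];'
match = re.search(r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData", js_text)
provinces = json.loads(match.group(1)) if match else []
print(provinces[0]["name"])  # 北京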
def discover_cities(self) -> List[CityTarget]:
text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
provinces = self._extract_area_data(text)
targets: List[CityTarget] = []
seen_area: Set[str] = set()
for province in provinces:
province_id = str(province.get("id") or "").strip()
province_name = str(province.get("name") or "").strip()
province_py = str(province.get("py") or "").strip()
child_rows = province.get("child") or []
# 常规省份 child 是地级市;直辖市 child 是区县,此时使用省级 id 抓取
if child_rows and any((row.get("child") or []) for row in child_rows):
for city in child_rows:
area_id = str(city.get("id") or "").strip()
city_name = str(city.get("name") or "").strip()
city_py = str(city.get("py") or "").strip()
if not area_id or not city_name:
def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
for attempt in range(max_retries):
try:
resp = self.session.post(LIST_URL, data=payload, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_session()
time.sleep(wait_time)
continue
if area_id in seen_area:
continue
seen_area.add(area_id)
targets.append(
CityTarget(
area_id=area_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=city_name,
city_py=city_py,
)
)
else:
if not province_id or not province_name:
continue
if province_id in seen_area:
continue
seen_area.add(province_id)
targets.append(
CityTarget(
area_id=province_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=province_name,
city_py=province_py,
)
)
print("请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f"请求失败: {exc}")
return None
return None
return targets
def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
ua = self.client.headers.get("User-Agent", "")
def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
return {
"AdCode": "",
"RegionId": str(area_id),
"RegionId": str(city_code),
"CategoryId": "",
"MaxNumber": "",
"OnlyData": "true",
"IgnoreButton": "",
"LawyerRecommendRequest[AreaId]": str(area_id),
"LawyerRecommendRequest[AreaId]": str(city_code),
"LawyerRecommendRequest[LawCategoryIds]": "",
"LawyerRecommendRequest[LawFirmPersonCount]": "",
"LawyerRecommendRequest[LawFirmScale]": "",
@@ -268,429 +190,163 @@ class Six4365Crawler:
"LawyerRecommendRequest[RefferUrl]": "",
"LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
"LawyerRecommendRequest[resource_type_name]": "",
"LawyerRecommendRequest[UserAgent]": ua,
"LawyerRecommendRequest[UserAgent]": self.session.headers["User-Agent"],
"LawyerRecommendRequest[AddLawyerWithNoData]": "false",
"ShowCaseButton": "true",
}
def fetch_list_html(self, target: CityTarget, page: int) -> str:
payload = self._build_payload(target.area_id, page)
return self._post_text(
LIST_API_URL,
data=payload,
referer=SITE_BASE + "/findlawyer/",
)
def parse_list_cards(self, html: str) -> List[ListCard]:
def _parse_list(self, html: str, province: str, city: str) -> int:
soup = BeautifulSoup(html, "html.parser")
cards: List[ListCard] = []
seen: Set[str] = set()
lawyers = soup.find_all("a", class_="lawyer")
if not lawyers:
return 0
for anchor in soup.select("a.lawyer[href]"):
href = (anchor.get("href") or "").strip()
detail_urls: List[str] = []
for lawyer in lawyers:
href = lawyer.get("href")
if not href:
continue
detail_url = urljoin(SITE_BASE, href)
if detail_url in seen:
detail_urls.append(f"{href.rstrip('/')}/info/")
if not detail_urls:
return 0
results: List[Dict[str, str]] = []
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
for fut in as_completed(futs):
try:
data = fut.result()
except Exception as exc:
print(f" 详情解析异常: {exc}")
continue
if data:
results.append(data)
if not results:
return len(detail_urls)
existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
for data in results:
if not data:
continue
seen.add(detail_url)
url = data.get("url", "")
if not url:
continue
if url in existing:
print(f" -- 已存在URL: {url}")
continue
try:
self.db.insert_data("lawyer", data)
print(f" -> 新增: {data['name']} ({data['phone']})")
except Exception as exc:
print(f" 插入失败 {url}: {exc}")
name = ""
name_tag = anchor.select_one("b.name")
if name_tag:
name = name_tag.get_text(strip=True)
return len(detail_urls)
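# requests.Session is not guaranteed to be thread-safe, which is why the pool
# above gives every worker its own session via threading.local. A standalone
# sketch of that pattern (fetch_status and the URL list are illustrative):
import threading
from concurrent.futures import ThreadPoolExecutor

import requests

_tls = threading.local()


def _thread_session() -> requests.Session:
    session = getattr(_tls, "session", None)
    if session is None:
        session = requests.Session()
        _tls.session = session
    return session


def fetch_status(url: str) -> int:
    resp = _thread_session().get(url, timeout=10)
    return resp.status_code


urls = ["https://example.com"] * 4
with ThreadPoolExecutor(max_workers=2) as pool:
    print(list(pool.map(fetch_status, urls)))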
specialties: List[str] = []
skill_tag = anchor.select_one("div.skill")
if skill_tag:
raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
specialties = [x.strip() for x in re.split(r"[、,]", raw) if x.strip()]
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
html = self._get_detail(url)
if not html:
return None
score_text = ""
score_tag = anchor.select_one("div.info span[title='评分'] em")
if score_tag:
score_text = score_tag.get_text(strip=True)
service_text = ""
service_tag = anchor.select_one("div.info")
if service_tag:
service_text = service_tag.get_text(" ", strip=True)
cards.append(
ListCard(
detail_url=detail_url,
name=name,
specialties=specialties,
score_text=score_text,
service_text=service_text,
)
)
return cards
def parse_detail(self, detail_url: str) -> Dict:
info_url = detail_url.rstrip("/") + "/info/"
html = self._get_text(info_url, referer=detail_url)
soup = BeautifulSoup(html, "html.parser")
base_info = soup.find("ul", class_="intro-basic-bar")
if not base_info:
return None
name = ""
law_firm = ""
phone = ""
practice_years: Optional[int] = None
office_area = ""
address = ""
specialties: List[str] = []
for li in soup.select("ul.intro-basic-bar li"):
label_tag = li.select_one("span.label")
value_tag = li.select_one("div.txt")
if not label_tag or not value_tag:
for li in base_info.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if not label or not txt:
continue
label_text = label.get_text(strip=True)
if "姓名" in label_text:
name = txt.get_text(strip=True)
if "执业律所" in label_text:
law_firm = txt.get_text(strip=True)
label = label_tag.get_text(" ", strip=True).replace(":", "")
value = value_tag.get_text(" ", strip=True)
more_section = soup.find("div", class_="more-intro-basic")
if more_section:
phone_ul = more_section.find("ul", class_="intro-basic-bar")
if phone_ul:
for li in phone_ul.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if label and txt and "联系电话" in label.get_text(strip=True):
phone = txt.get_text(strip=True).replace(" ", "")
break
if "姓名" in label and not name:
name = value
elif "执业律所" in label and not law_firm:
law_firm = value
elif "联系电话" in label and not phone:
phone = normalize_phone(value)
elif "执业年限" in label and practice_years is None:
year_match = YEAR_RE.search(value)
if year_match:
try:
practice_years = int(year_match.group(1))
except Exception:
practice_years = None
elif "办公地区" in label and not office_area:
office_area = value
elif "办公地址" in label and not address:
address = value
text = soup.get_text(" ", strip=True)
if not phone:
phone = normalize_phone(text)
if not name and soup.title:
title = soup.title.get_text(" ", strip=True)
match = re.search(r"([^\s_,。]+?)律师", title)
if match:
name = match.group(1).strip()
skill_match = re.search(r"擅长:([^\n]+)", text)
if skill_match:
specialties = [x.strip() for x in re.split(r"[、,]", skill_match.group(1)) if x.strip()]
return {
"name": name,
"law_firm": law_firm,
"phone": phone,
"practice_years": practice_years,
"office_area": office_area,
"address": address,
"specialties": specialties,
"detail_url": detail_url,
"info_url": info_url,
}
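# parse_detail reads the label/value pairs inside ul.intro-basic-bar. A minimal
# sketch on a hand-written fragment (the HTML itself is invented for
# illustration):
from bs4 import BeautifulSoup

fragment = (
    '<ul class="intro-basic-bar">'
    '<li><span class="label">姓名:</span><div class="txt">张三</div></li>'
    '<li><span class="label">执业律所:</span><div class="txt">某某律师事务所</div></li>'
    '</ul>'
)
soup = BeautifulSoup(fragment, "html.parser")
info = {}
for li in soup.select("ul.intro-basic-bar li"):
    label_tag = li.select_one("span.label")
    value_tag = li.select_one("div.txt")
    if label_tag and value_tag:
        label = label_tag.get_text(strip=True).replace(":", "")
        info[label] = value_tag.get_text(strip=True)
print(info)  # {'姓名': '张三', '执业律所': '某某律师事务所'}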
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
seen_detail_urls: Set[str] = set()
page_first_seen: Set[str] = set()
for page in range(1, self.max_pages + 1):
try:
html = self.fetch_list_html(target, page)
except Exception as exc:
print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
break
cards = self.parse_list_cards(html)
if not cards:
break
first_url = cards[0].detail_url
if first_url in page_first_seen:
break
page_first_seen.add(first_url)
for card in cards:
if card.detail_url in seen_detail_urls:
continue
seen_detail_urls.add(card.detail_url)
try:
detail = self.parse_detail(card.detail_url)
except Exception as exc:
print(f"[detail] 失败 {card.detail_url}: {exc}")
continue
now = int(time.time())
uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
uid = uid_match.group(1) if uid_match else card.detail_url
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"province_id": target.province_id,
"province": target.province_name,
"province_py": target.province_py,
"area_id": target.area_id,
"city": target.city_name,
"city_py": target.city_py,
"page": page,
"detail_url": card.detail_url,
"info_url": detail.get("info_url", ""),
},
"list_snapshot": {
"name": card.name,
"specialties": card.specialties,
"score_text": card.score_text,
"service_text": card.service_text,
},
"profile": {
"name": detail.get("name") or card.name,
"law_firm": detail.get("law_firm") or "",
"phone": detail.get("phone") or "",
"practice_years": detail.get("practice_years"),
"office_area": detail.get("office_area") or "",
"address": detail.get("address") or "",
"specialties": detail.get("specialties") or card.specialties,
},
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
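# crawl_city stops paging when a page's first detail URL repeats, which guards
# against list endpoints that keep serving the last page. The loop shape in
# isolation (fetch_page and parse_first_url stand in for the methods above):
def paginate(fetch_page, parse_first_url, max_pages=9999):
    seen_first = set()
    for page in range(1, max_pages + 1):
        html = fetch_page(page)
        if not html:
            break
        first_url = parse_first_url(html)
        if not first_url or first_url in seen_first:
            break
        seen_first.add(first_url)
        yield page, html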
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
phone = phone.replace('-', '').strip()
if not name or not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
data = {
"phone": phone,
"province": province,
"city": city,
"phone": phone,
"url": (source.get("info_url") or source.get("detail_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
"law_firm": law_firm,
"url": url,
"domain": DOMAIN,
"name": name,
"create_time": int(time.time()),
"params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
}
return data
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
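# The IN-clause lookups above are chunked so a city with thousands of phones
# never produces an oversized statement. The slicing is the whole trick; a
# minimal helper sketch (chunked itself is not part of the diff):
def chunked(items, size=500):
    for start in range(0, len(items), size):
        yield items[start:start + size]


# e.g. list(chunked(list(range(1200)))) -> three chunks of 500, 500 and 200 items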
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
session = self._get_thread_session()
for attempt in range(max_retries):
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现地区 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [
c for c in cities
if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
]
print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断地区数 {len(cities)}")
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
resp = session.get(url, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_thread_session()
session = self._get_thread_session()
time.sleep(wait_time)
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
print(" 请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f" 请求失败: {exc}")
return None
return None
total_new_json = 0
total_new_db = 0
total_skip_db = 0
def run(self):
print("启动律图采集...")
if not self.cities:
print("无城市数据")
return
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"(area={target.area_id})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
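# Appending to JSONL plus a record_id set makes the run resumable: re-running
# the crawler only adds records it has not written before. The resume step in
# isolation (mirrors the seen_ids loading above; treat it as a sketch):
import json
import os


def load_seen_ids(output_path: str) -> set:
    seen = set()
    if not os.path.exists(output_path):
        return seen
    with open(output_path, "r", encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                continue
            rid = item.get("record_id")
            if rid:
                seen.add(rid)
    return seen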
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个地区,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个地区最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市名称/拼音/编码过滤",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="详情页请求间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
for city_code, info in self.cities.items():
province = info.get("province_name", "")
city = info.get("name", "")
print(f"采集 {province}-{city}")
page = 1
while True:
payload = self._build_payload(city_code, page)
html = self._post(payload)
if not html:
break
link_count = self._parse_list(html, province, city)
if link_count == 0:
break
page += 1
print("律图采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = Six4365Spider(db)
spider.run()
+8 -75
View File
@@ -1,80 +1,13 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs"
DATA_DIR="${PROJECT_ROOT}/data"
# 切换到脚本所在目录,确保相对路径正确
cd "$(dirname "$0")"
mkdir -p "${LOG_DIR}" "${DATA_DIR}"
echo "使用 request/proxy_settings.json 读取代理配置"
if [[ -x "${PROJECT_ROOT}/.venv/bin/python" ]]; then
PYTHON_BIN="${PROJECT_ROOT}/.venv/bin/python"
else
PYTHON_BIN="python3"
fi
RUN_MODE="${RUN_MODE:-parallel}" # parallel | sequential
echo "[start] project=${PROJECT_ROOT}"
echo "[start] python=${PYTHON_BIN}"
echo "[start] mode=${RUN_MODE}"
echo "[start] proxy=request/proxy_settings.json"
# 大律师(新结构采集 + 写库)可通过环境变量控制
DLS_OUTPUT="${DLS_OUTPUT:-${DATA_DIR}/dls_records.jsonl}"
DLS_MAX_CITIES="${DLS_MAX_CITIES:-0}"
DLS_MAX_PAGES="${DLS_MAX_PAGES:-0}"
DLS_SLEEP="${DLS_SLEEP:-0.2}"
DLS_CITY_FILTER="${DLS_CITY_FILTER:-}"
DLS_EXTRA_ARGS=()
if [[ "${DLS_MAX_CITIES}" != "0" ]]; then
DLS_EXTRA_ARGS+=(--max-cities "${DLS_MAX_CITIES}")
fi
if [[ "${DLS_MAX_PAGES}" != "0" ]]; then
DLS_EXTRA_ARGS+=(--max-pages "${DLS_MAX_PAGES}")
fi
if [[ -n "${DLS_CITY_FILTER}" ]]; then
DLS_EXTRA_ARGS+=(--city-filter "${DLS_CITY_FILTER}")
fi
DLS_EXTRA_ARGS+=(--sleep "${DLS_SLEEP}" --output "${DLS_OUTPUT}")
if [[ "${DLS_DIRECT:-0}" == "1" ]]; then
DLS_EXTRA_ARGS+=(--direct)
fi
if [[ "${DLS_NO_DB:-0}" == "1" ]]; then
DLS_EXTRA_ARGS+=(--no-db)
fi
run_bg() {
local name="$1"
shift
local logfile="${LOG_DIR}/${name}.log"
nohup env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1 &
echo "[start] ${name} pid=$! log=${logfile}"
}
run_fg() {
local name="$1"
shift
local logfile="${LOG_DIR}/${name}.log"
echo "[start] ${name} fg log=${logfile}"
env PYTHONUNBUFFERED=1 "$@" > "${logfile}" 2>&1
}
if [[ "${RUN_MODE}" == "sequential" ]]; then
run_fg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
run_fg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
run_fg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
run_fg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
run_fg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
echo "[done] sequential completed"
else
run_bg "dls_fresh" "${PYTHON_BIN}" "${SCRIPT_DIR}/dls_fresh.py" "${DLS_EXTRA_ARGS[@]}"
run_bg "findlaw" "${PYTHON_BIN}" "${SCRIPT_DIR}/findlaw.py"
run_bg "lawtime" "${PYTHON_BIN}" "${SCRIPT_DIR}/lawtime.py"
run_bg "six4365" "${PYTHON_BIN}" "${SCRIPT_DIR}/six4365.py"
run_bg "hualv" "${PYTHON_BIN}" "${SCRIPT_DIR}/hualv.py"
echo "[done] all crawlers started in background"
fi
nohup python ../common_sites/dls.py > dls.log 2>&1 & # 大律师
nohup python ../common_sites/findlaw.py > findlaw.log 2>&1 & # 找法网
nohup python ../common_sites/lawtime.py > lawtime.log 2>&1 & # 法律快车
nohup python ../common_sites/six4365.py > six4365.log 2>&1 & # 律图
nohup python ../common_sites/hualv.py > hualv.log 2>&1 & # 华律
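# The replacement script above launches each crawler with nohup and a per-site
# log file. A rough Python equivalent of that launcher, for reference only
# (crawler names, paths and the python3 binary are assumptions):
import os
import subprocess

CRAWLERS = ["dls_fresh.py", "findlaw.py", "lawtime.py", "six4365.py", "hualv.py"]


def start_all(script_dir: str, log_dir: str, python_bin: str = "python3"):
    os.makedirs(log_dir, exist_ok=True)
    procs = []
    for name in CRAWLERS:
        log_path = os.path.join(log_dir, name.replace(".py", ".log"))
        log_file = open(log_path, "a")
        proc = subprocess.Popen(
            [python_bin, os.path.join(script_dir, name)],
            stdout=log_file,
            stderr=subprocess.STDOUT,
            env={**os.environ, "PYTHONUNBUFFERED": "1"},
        )
        log_file.close()  # the child process keeps its own copy of the descriptor
        procs.append(proc)
        print(f"[start] {name} pid={proc.pid} log={log_path}")
    return procs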