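"""律图 (m.64365.com) lawyer spider.

Loads the city list from the region tables, pages through the
LawyerRecommend listing endpoint for each city, fetches lawyer detail
pages concurrently, and inserts new records into the `lawyer` table.
"""
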
import json
import os
import random
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Optional, Set

# Make the sibling "request" package and the project root importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import urllib3
from bs4 import BeautifulSoup

# These imports rely on the sys.path entries added above.
from request.requests_client import RequestClientError, RequestsClient

# Requests are sent with verify=False, so silence the TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from Db import Db

DOMAIN = "律图"
LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"


class Six4365Spider:
    def __init__(self, db_connection):
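        """Wire up the DB handle, shared HTTP client, worker pool size,
        and the city map. Worker count comes from the SPIDER_WORKERS
        env var (default 8)."""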
        self.db = db_connection
        self.client = self._build_session()
        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
        self._tls = threading.local()
        self.cities = self._load_cities()

    def _build_session(self) -> RequestsClient:
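        """Build the shared HTTP client with a mobile Safari User-Agent."""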
        return RequestsClient(headers={
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Connection": "close",
        })

    def _refresh_session(self) -> None:
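        """Refresh the shared list-page client (called after a 403 block)."""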
        self.client.refresh()

    def _get_thread_session(self) -> RequestsClient:
        """Each thread uses its own request client (cloned from the shared
        client, so it reuses the same headers/proxy configuration)."""
        s = getattr(self._tls, "session", None)
        if s is not None:
            return s
        s = self.client.clone()
        self._tls.session = s
        return s

    def _refresh_thread_session(self) -> None:
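        """Close and drop this thread's client; the next request builds a fresh one."""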
        s = getattr(self._tls, "session", None)
        if s is not None:
            s.close()
        self._tls.session = None

    def _existing_urls(self, urls: List[str]) -> Set[str]:
        """Batch duplicate check: one IN query per chunk instead of N
        separate is_data_exist calls."""
        if not urls:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            # Too many IN parameters can fail, so query in chunks.
            chunk_size = 500
            for i in range(0, len(urls), chunk_size):
                chunk = urls[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
                cur.execute(sql, chunk)
                for row in cur.fetchall():
                    # pymysql returns tuples by default
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

    def _load_cities(self):
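        """Load the 64365 city map, trying the region tables in order
        (area_new, area2, area).

        Returns {city_code: {"name", "province", "province_name"}}; an
        empty dict means no usable region data was found.
        """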
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                provinces = self.db.select_data(
                    table,
                    "id, code, province",
                    "domain='64365' AND level=1"
                ) or []
                cities = self.db.select_data(
                    table,
                    "code, city, province, pid",
                    "domain='64365' AND level=2"
                ) or []
            except Exception as exc:
                last_error = exc
                continue

            if not cities:
                continue

            province_map = {row.get('id'): row for row in provinces}
            data = {}
            for city in cities:
                province_row = province_map.get(city.get('pid'), {}) or {}
                data[str(city.get('code'))] = {
                    "name": city.get('city'),
                    "province": city.get('province'),
                    "province_name": province_row.get('province', city.get('province')),
                }
            print(f"[律图] city source table: {table}, cities: {len(cities)}")
            return data

        if last_error:
            print(f"[律图] failed to load region data: {last_error}")
        print("[律图] no city data (tried area_new/area2/area)")
        return {}

    def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
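        """POST one listing-page request, retrying 403s with exponential
        backoff and a session refresh. Returns the response text, or None
        on failure."""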
        for attempt in range(max_retries):
            try:
                resp = self.client.post_text(LIST_URL, data=payload, timeout=10, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        # Exponential backoff with jitter, plus a fresh session.
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"Blocked with 403, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries})")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print("Request failed: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error")
                return text
            except RequestClientError as exc:
                print(f"Request failed: {exc}")
                return None
        return None

    def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
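        """Assemble the form payload for one page of a city's LawyerRecommend listing."""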
        return {
            "AdCode": "",
            "RegionId": str(city_code),
            "CategoryId": "",
            "MaxNumber": "",
            "OnlyData": "true",
            "IgnoreButton": "",
            "LawyerRecommendRequest[AreaId]": str(city_code),
            "LawyerRecommendRequest[LawCategoryIds]": "",
            "LawyerRecommendRequest[LawFirmPersonCount]": "",
            "LawyerRecommendRequest[LawFirmScale]": "",
            "LawyerRecommendRequest[OrderType]": "0",
            "LawyerRecommendRequest[PageIndex]": str(page),
            "LawyerRecommendRequest[PageSize]": "10",
            "LawyerRecommendRequest[TagId]": "",
            "LawyerRecommendRequest[Type]": "1",
            "LawyerRecommendRequest[AccountType]": "",
            "LawyerRecommendRequest[AddLawyer]": "true",
            "LawyerRecommendRequest[Content]": "",
            "LawyerRecommendRequest[Duty]": "",
            "LawyerRecommendRequest[ExcludeLawyerIds][]": "",
            "LawyerRecommendRequest[RefferUrl]": "",
            "LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
            "LawyerRecommendRequest[resource_type_name]": "",
            "LawyerRecommendRequest[UserAgent]": self.client.headers["User-Agent"],
            "LawyerRecommendRequest[AddLawyerWithNoData]": "false",
            "ShowCaseButton": "true",
        }

    def _parse_list(self, html: str, province: str, city: str) -> int:
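        """Parse one listing page: fetch details concurrently, insert new rows.

        Returns the number of detail links found (0 ends pagination for
        the city), not the number of rows inserted.
        """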
        soup = BeautifulSoup(html, "html.parser")
        lawyers = soup.find_all("a", class_="lawyer")
        if not lawyers:
            return 0

        detail_urls: List[str] = []
        for lawyer in lawyers:
            href = lawyer.get("href")
            if not href:
                continue
            detail_urls.append(f"{href.rstrip('/')}/info/")

        if not detail_urls:
            return 0

        results: List[Dict[str, str]] = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
            for fut in as_completed(futs):
                try:
                    data = fut.result()
                except Exception as exc:
                    print(f" detail parse error: {exc}")
                    continue
                if data:
                    results.append(data)

        if not results:
            # Keep paginating even if every detail on this page failed to parse.
            return len(detail_urls)

        existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
        for data in results:
            if not data:
                continue
            url = data.get("url", "")
            if not url:
                continue
            if url in existing:
                print(f" -- URL already exists: {url}")
                continue
            try:
                self.db.insert_data("lawyer", data)
                print(f" -> added: {data['name']} ({data['phone']})")
            except Exception as exc:
                print(f" insert failed {url}: {exc}")

        return len(detail_urls)

    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
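        """Fetch and parse a lawyer detail page into a `lawyer` row dict.

        Returns None when the page is missing, lacks the basic-info
        block, or has no name/phone.
        """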
        html = self._get_detail(url)
        if not html:
            return None

        soup = BeautifulSoup(html, "html.parser")
        base_info = soup.find("ul", class_="intro-basic-bar")
        if not base_info:
            return None

        name = ""
        law_firm = ""
        phone = ""

        # Page labels are Chinese: "姓名" = name, "执业律所" = law firm.
        for li in base_info.find_all("li"):
            label = li.find("span", class_="label")
            txt = li.find("div", class_="txt")
            if not label or not txt:
                continue
            label_text = label.get_text(strip=True)
            if "姓名" in label_text:
                name = txt.get_text(strip=True)
            if "执业律所" in label_text:
                law_firm = txt.get_text(strip=True)

        # "联系电话" = contact phone, found in the extended info section.
        more_section = soup.find("div", class_="more-intro-basic")
        if more_section:
            phone_ul = more_section.find("ul", class_="intro-basic-bar")
            if phone_ul:
                for li in phone_ul.find_all("li"):
                    label = li.find("span", class_="label")
                    txt = li.find("div", class_="txt")
                    if label and txt and "联系电话" in label.get_text(strip=True):
                        phone = txt.get_text(strip=True).replace(" ", "")
                        break

        phone = phone.replace('-', '').strip()
        if not name or not phone:
            return None

        data = {
            "phone": phone,
            "province": province,
            "city": city,
            "law_firm": law_firm,
            "url": url,
            "domain": DOMAIN,
            "name": name,
            "create_time": int(time.time()),
            "params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
        }
        return data

    def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
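        """GET a detail page with the thread-local client, retrying 403s
        with backoff and a fresh thread session."""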
        session = self._get_thread_session()
        for attempt in range(max_retries):
            try:
                resp = session.get_text(url, timeout=10, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f" Blocked with 403, retrying in {wait_time:.1f}s ({attempt + 1}/{max_retries})")
                        # Drop this thread's session and build a fresh one.
                        self._refresh_thread_session()
                        session = self._get_thread_session()
                        time.sleep(wait_time)
                        continue
                    print(" Request failed: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error")
                return text
            except RequestClientError as exc:
                print(f" Request failed: {exc}")
                return None
        return None

    def run(self):
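        """Crawl every loaded city, paging until a request fails or a
        page yields no lawyer links."""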
print("启动律图采集...")
|
||
if not self.cities:
|
||
print("无城市数据")
|
||
return
|
||
|
||
for city_code, info in self.cities.items():
|
||
province = info.get("province_name", "")
|
||
city = info.get("name", "")
|
||
print(f"采集 {province}-{city}")
|
||
page = 1
|
||
while True:
|
||
payload = self._build_payload(city_code, page)
|
||
html = self._post(payload)
|
||
if not html:
|
||
break
|
||
link_count = self._parse_list(html, province, city)
|
||
if link_count == 0:
|
||
break
|
||
page += 1
|
||
print("律图采集完成")
|
||
|
||
|
||
if __name__ == "__main__":
    with Db() as db:
        spider = Six4365Spider(db)
        spider.run()