chore: initialize lawyers crawler project
This commit is contained in:
@@ -0,0 +1,278 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import random
|
||||
from typing import Dict, Optional, List, Set
|
||||
from urllib.parse import urljoin
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import threading
|
||||
|
||||
# Make the sibling "request" package and the project root importable
# before the project-local imports below run.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import urllib3
from bs4 import BeautifulSoup
from request.requests_client import RequestClientError, RequestsClient

# Pages are fetched with verify=False; silence the resulting TLS warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from Db import Db
from config import LAWTIME_CONFIG

# Mobile list-page template and the base used to resolve detail links.
LIST_BASE = "https://m.lawtime.cn/{pinyin}/lawyer/?page={page}"
DETAIL_BASE = "https://m.lawtime.cn"
# Source label stored in the `lawyer.domain` column.
DOMAIN = "法律快车"
|
||||
|
||||
|
||||
class LawtimeSpider:
    """Crawler for lawyer contact listings on m.lawtime.cn (法律快车).

    Walks each city's mobile list pages, resolves detail pages concurrently
    on a thread pool, and inserts new records into the `lawyer` table,
    deduplicating on phone number within this domain.
    """

    def __init__(self, db_connection):
        # Db wrapper exposing select_data/insert_data plus a raw `.db`
        # DB-API connection (used directly in _existing_phones).
        self.db = db_connection
        self.client = self._build_session()
        # Size of the detail-page thread pool; overridable via env var.
        self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
        # Thread-local storage: each pool worker keeps its own session clone.
        self._tls = threading.local()

    def _build_session(self) -> RequestsClient:
        """Build the shared HTTP client from the configured headers."""
        headers = LAWTIME_CONFIG.get("HEADERS", {})
        custom_headers = dict(headers) if headers else {}
        # Disable keep-alive so a blocked/stale connection is not reused.
        custom_headers.setdefault("Connection", "close")
        return RequestsClient(headers=custom_headers)

    def _refresh_session(self) -> None:
        """Reset the shared session (called after a 403 on the main thread)."""
        self.client.refresh()

    def _get_thread_session(self) -> RequestsClient:
        """Return this thread's private session, cloning it on first use."""
        session = getattr(self._tls, "session", None)
        if session is None:
            session = self.client.clone()
            self._tls.session = session
        return session

    def _refresh_thread_session(self) -> None:
        """Close and drop this thread's session; the next access re-clones it."""
        session = getattr(self._tls, "session", None)
        if session is not None:
            session.close()
        self._tls.session = None

    def _existing_phones(self, phones: List[str]) -> Set[str]:
        """Return the subset of `phones` already stored for this domain.

        Queries in chunks of 500 so the IN() placeholder list stays bounded;
        all values are bound parameters (no data interpolated into the SQL).
        """
        if not phones:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

    def _load_areas(self):
        """Load level-2 area rows (pinyin/province/city), trying fallback tables.

        Tries area_new, then area, then area2; returns the first non-empty
        result, or [] when none yields rows.
        """
        # Use the module-level DOMAIN constant so the filter always matches
        # the value written by _parse_detail (same string as before).
        condition = f"level = 2 and domain='{DOMAIN}'"
        tables = ("area_new", "area", "area2")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "pinyin, province, city", condition) or []
            except Exception as exc:
                # Table may be absent in this deployment; try the next one.
                last_error = exc
                continue
            if rows:
                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
                print(f"[法律快车] 城市来源表: {table}, 城市数: {len(rows)}, 缺少pinyin: {missing_pinyin}")
                return rows

        if last_error:
            print(f"[法律快车] 加载地区数据失败: {last_error}")
        print("[法律快车] 无城市数据(已尝试 area_new/area/area2)")
        return []

    def _get(self, url: str, max_retries: int = 3) -> Optional[str]:
        """Fetch `url` with the shared session; see _get_with_session."""
        return self._get_with_session(self.client, url, max_retries=max_retries, is_thread=False)

    def _get_with_session(self, session: RequestsClient, url: str, max_retries: int = 3, is_thread: bool = False) -> Optional[str]:
        """Fetch `url`, retrying 403 responses with backoff and a fresh session.

        Returns the response text, or None when the 403 retries are exhausted
        or a client error occurs. `is_thread` selects whether the thread-local
        or the shared session is refreshed after a 403.
        """
        for attempt in range(max_retries):
            try:
                resp = session.get_text(url, timeout=15, verify=False)
                status_code = resp.status_code
                text = resp.text
                if status_code == 403:
                    if attempt < max_retries - 1:
                        # Exponential backoff with jitter before retrying.
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"请求失败 {url}: 403,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
                        if is_thread:
                            self._refresh_thread_session()
                            session = self._get_thread_session()
                        else:
                            self._refresh_session()
                            session = self.client
                        time.sleep(wait_time)
                        continue
                    print(f"请求失败 {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise RequestClientError(f"{status_code} Error: {url}")
                return text
            except RequestClientError as exc:
                print(f"请求失败 {url}: {exc}")
                return None
        return None

    def _parse_list(self, html: str, province: str, city: str) -> int:
        """Parse one list page, crawl its detail pages, and store new lawyers.

        Returns the number of detail links found (0 signals the last page to
        the caller) regardless of how many records were actually inserted.
        """
        soup = BeautifulSoup(html, "html.parser")
        links = [a.get("href", "") for a in soup.select("a.hide_link")]
        # The site obfuscates hrefs by writing "lll" where "int" belongs.
        links = [link.replace("lll", "int") for link in links if link]
        if not links:
            return 0

        detail_urls = [urljoin(DETAIL_BASE, link) for link in links]

        results: List[Dict[str, str]] = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
            for fut in as_completed(futs):
                try:
                    data = fut.result()
                except Exception as exc:
                    print(f" 详情解析异常: {exc}")
                    continue
                if data and data.get("phone"):
                    results.append(data)

        if not results:
            return len(detail_urls)

        # Pre-check which phones already exist so duplicates are skipped
        # without attempting an insert.
        phones = [d["phone"] for d in results if d.get("phone")]
        existing = self._existing_phones(phones)

        for data in results:
            phone = data.get("phone")
            if not phone:
                continue
            if phone in existing:
                print(f" -- 已存在: {data['name']} ({phone})")
                continue
            try:
                self.db.insert_data("lawyer", data)
                print(f" -> 新增: {data['name']} ({phone})")
            except Exception as exc:
                print(f" 插入失败 {data.get('url')}: {exc}")

        return len(detail_urls)

    @staticmethod
    def _extract_name(soup) -> str:
        """Pull the lawyer's name from <title>, falling back to div.intl."""
        title_tag = soup.find("title")
        if title_tag:
            match = re.search(r"(\S+)律师", title_tag.get_text())
            if match:
                return match.group(1)
        intl_div = soup.find("div", class_="intl")
        if intl_div:
            match = re.search(r"(\S+)律师", intl_div.get_text())
            if match:
                return match.group(1)
        return ""

    @staticmethod
    def _extract_phone(soup, page_text: str) -> str:
        """Find a mainland mobile number, preferring labeled contact rows."""
        phone_pattern = r"1[3-9]\d{9}"
        for item in soup.select("div.item.flex"):
            label = item.find("div", class_="label")
            desc = item.find("div", class_="desc")
            if not label or not desc:
                continue
            label_text = label.get_text()
            # Numbers may be rendered with dashes; strip them before matching.
            desc_text = desc.get_text().replace("-", "")
            if "联系电话" in label_text or "电话" in label_text:
                matches = re.findall(phone_pattern, desc_text)
                if matches:
                    return matches[0]
        # Fallback: scan the whole page text for any mobile number.
        matches = re.findall(phone_pattern, page_text.replace("-", ""))
        return matches[0] if matches else ""

    @staticmethod
    def _extract_law_firm(soup) -> str:
        """Read the law-firm name from the labeled info rows, if present."""
        for item in soup.select("div.item.flex"):
            label = item.find("div", class_="label")
            desc = item.find("div", class_="desc")
            if not label or not desc:
                continue
            if "执业律所" in label.get_text() or "律所" in label.get_text():
                return desc.get_text(strip=True).replace("已认证", "")
        return ""

    def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
        """Fetch and parse one lawyer detail page into an insertable record.

        Returns None when the page cannot be fetched or carries no phone
        number. Runs on pool threads, so it uses the thread-local session.
        """
        sess = self._get_thread_session()
        html = self._get_with_session(sess, url, max_retries=3, is_thread=True)
        if not html:
            return None

        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(" ")

        name = self._extract_name(soup)

        phone = self._extract_phone(soup, text)
        if not phone:
            print(f" 无手机号: {url}")
            return None

        law_firm = self._extract_law_firm(soup)

        # NOTE(review): "list_url" actually stores the detail-page URL (same
        # value as the top-level "url" field) — confirm consumers expect that.
        params = {
            "list_url": url,
            "province": province,
            "city": city,
        }

        return {
            "name": name or "",
            "law_firm": law_firm,
            "province": province,
            "city": city,
            "phone": phone,
            "url": url,
            "domain": DOMAIN,
            "create_time": int(time.time()),
            "params": json.dumps(params, ensure_ascii=False)
        }

    def run(self):
        """Crawl every area: paginate each city's list until an empty page."""
        print("启动法律快车采集...")
        areas = self._load_areas()
        if not areas:
            print("无地区数据")
            return

        for area in areas:
            pinyin = area.get("pinyin")
            province = area.get("province", "")
            city = area.get("city", "")
            if not pinyin:
                # Cannot build the list URL without the city's pinyin slug.
                continue
            page = 1
            while True:
                list_url = LIST_BASE.format(pinyin=pinyin, page=page)
                print(f"采集 {province}-{city} 第 {page} 页: {list_url}")
                html = self._get(list_url)
                if not html:
                    break
                link_count = self._parse_list(html, province, city)
                if link_count == 0:
                    break
                page += 1
        print("法律快车采集完成")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Context-managed DB connection: closed automatically on exit.
    with Db() as db:
        LawtimeSpider(db).run()
|
||||
Reference in New Issue
Block a user