Files
lawyers/services/area_sync_service.py
T

590 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import os
import re
import sys
import threading
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Dict, Iterable, List, Set, Tuple
from urllib.parse import parse_qs, urlparse
# Make the project root (parent of this services directory) importable so
# top-level modules such as `Db` resolve when this file is run as a script.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.append(project_root)
from Db import Db
AREA_TABLE = os.getenv("AREA_TARGET_TABLE", "area_new")
AREA_DOMAIN = os.getenv("AREA_DOMAIN", "maxlaw")
DOUYIN_DOMAIN = os.getenv("DOUYIN_DOMAIN", "抖音")
DOUYIN_RAW_DIR = os.getenv("DOUYIN_RAW_DIR", os.path.join(project_root, "data", "douyin_raw"))
DOUYIN_SAVE_ONLY_ENV = os.getenv("DOUYIN_SAVE_ONLY", "1")
SERVICE_HOST = os.getenv("AREA_SERVICE_HOST", "0.0.0.0")
SERVICE_PORT = int(os.getenv("AREA_SERVICE_PORT", "9002"))
PHONE_REGEX = re.compile(r"(?:\+?86[-\s]?)?(1[3-9]\d{9})")
WX_CONTEXT_REGEX = re.compile(r"(?i)(?:微信|微.?信|wx|vx|weixin|v信|v号|v)\s*[:/\-\s]\s*([a-zA-Z0-9._-]{3,40})")
LAW_FIRM_REGEX = re.compile(r"([\u4e00-\u9fa5A-Za-z·]{2,40}律师事务所)")
RAW_WRITE_LOCK = threading.Lock()
def _is_safe_table_name(table_name: str) -> bool:
return bool(re.fullmatch(r"[A-Za-z0-9_]+", table_name or ""))
def _parse_int(value: Any, default: int = 0) -> int:
try:
return int(str(value).strip())
except Exception:
return default
def _parse_bool(value: Any, default: bool = False) -> bool:
if value is None:
return default
text = str(value).strip().lower()
if text in {"1", "true", "yes", "y", "on"}:
return True
if text in {"0", "false", "no", "n", "off"}:
return False
return default
def _first_param(params: Dict[str, List[str]], key: str, default: str = "") -> str:
values = params.get(key) or []
if not values:
return default
return values[0]
def _append_jsonl(file_path: str, payload: Dict[str, Any]) -> None:
    """Serialize *payload* as a single JSON line and append it to *file_path*.

    Creates the parent directory on demand; writes are serialized through
    RAW_WRITE_LOCK so concurrent handler threads cannot interleave lines.
    """
    directory = os.path.dirname(file_path) or "."
    os.makedirs(directory, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False)
    with RAW_WRITE_LOCK, open(file_path, "a", encoding="utf-8") as handle:
        handle.write(serialized)
        handle.write("\n")
def _save_raw_index_payload(payload: Dict[str, Any], query: Dict[str, List[str]], client_ip: str) -> str:
    """Archive one raw index payload into the per-day JSONL file; return its path."""
    received_at = int(time.time())
    day_stamp = time.strftime("%Y%m%d", time.localtime(received_at))
    target = os.path.join(DOUYIN_RAW_DIR, f"douyin_index_{day_stamp}.jsonl")
    record = {
        "received_at": received_at,
        "client_ip": client_ip,
        "query": query,
        "payload": payload,
    }
    _append_jsonl(target, record)
    return target
def _query_area_data(table_name: str, domain: str) -> List[Dict[str, Any]]:
    """Fetch all area rows for *domain* from *table_name*, ordered by id.

    Returns a list of plain dicts with empty-string / zero defaults for NULLs.

    Raises:
        ValueError: if *table_name* contains anything outside [A-Za-z0-9_] —
            the table name is interpolated into the SQL, so it must be validated.
    """
    if not _is_safe_table_name(table_name):
        raise ValueError("非法表名")
    with Db() as db:
        cursor = db.db.cursor()
        try:
            # Only the domain is parameterized; the table name was validated above.
            sql = (
                f"SELECT province, city, name, pid, pinyin, code, domain, level, create_time "
                f"FROM `{table_name}` WHERE domain=%s ORDER BY id ASC"
            )
            cursor.execute(sql, (domain,))
            rows = cursor.fetchall()
        finally:
            # Release the cursor even when the query fails (original leaked it on error).
            cursor.close()
    result: List[Dict[str, Any]] = []
    for row in rows:
        result.append(
            {
                "province": row[0] or "",
                "city": row[1] or "",
                "name": row[2] or "",
                "pid": row[3] if row[3] is not None else 0,
                "pinyin": row[4] or "",
                "code": row[5] or "",
                "domain": row[6] or "",
                "level": row[7] if row[7] is not None else 0,
                "create_time": row[8] if row[8] is not None else 0,
            }
        )
    return result
def _iter_dict_nodes(value: Any) -> Iterable[Dict[str, Any]]:
stack: List[Any] = [value]
while stack:
current = stack.pop()
if isinstance(current, dict):
yield current
stack.extend(current.values())
elif isinstance(current, list):
stack.extend(current)
def _extract_phones_from_text(text: str) -> List[str]:
    """Collect unique CN mobile numbers from *text*, preserving first-seen order."""
    ordered: List[str] = []
    known: Set[str] = set()
    for hit in PHONE_REGEX.finditer(text or ""):
        number = hit.group(1)
        if number and number not in known:
            known.add(number)
            ordered.append(number)
    return ordered
def _extract_phones_from_user_info(user_info: Dict[str, Any]) -> List[str]:
    """Pull candidate phone numbers from a Douyin user_info dict, sorted ascending."""
    signature = str(user_info.get("signature") or "")
    unique_id = str(user_info.get("unique_id") or "")
    versatile = str(user_info.get("versatile_display") or "")
    # 1) Phones matched directly in the signature win outright.
    direct = set(_extract_phones_from_text(signature))
    if direct:
        return sorted(direct)
    # 2) Otherwise mine WeChat-style markers, then fall back to the id fields.
    collected: Set[str] = set()
    for source in (signature, unique_id, versatile):
        for marker in WX_CONTEXT_REGEX.finditer(source):
            collected.update(_extract_phones_from_text(marker.group(1) or ""))
    for source in (unique_id, versatile):
        collected.update(_extract_phones_from_text(source))
    return sorted(collected)
def _extract_law_firm_from_user_info(user_info: Dict[str, Any]) -> str:
    """Find a law-firm name ("...律师事务所") in signature, verify reason, or cert label."""
    candidates: List[str] = []
    for key in ("signature", "enterprise_verify_reason"):
        text = str(user_info.get(key) or "")
        if text:
            candidates.append(text)
    cert_text = ""
    cert_raw = user_info.get("account_cert_info")
    if isinstance(cert_raw, str) and cert_raw.strip():
        try:
            parsed = json.loads(cert_raw)
        except Exception:
            # Not valid JSON: treat the raw string itself as the candidate text.
            cert_text = cert_raw.strip()
        else:
            if isinstance(parsed, dict):
                cert_text = str(parsed.get("label_text") or "").strip()
    if cert_text:
        candidates.append(cert_text)
    for text in candidates:
        hit = LAW_FIRM_REGEX.search(text)
        if hit:
            return hit.group(1)
    return ""
def _pick_first_str(node: Dict[str, Any], keys: Tuple[str, ...]) -> str:
for key in keys:
value = node.get(key)
if isinstance(value, str):
text = value.strip()
if text:
return text
return ""
def _extract_name(node: Dict[str, Any]) -> str:
    """Best-effort display name: top-level name keys first, then common nested containers."""
    top_level = _pick_first_str(
        node, ("name", "nickname", "nick_name", "author_name", "title", "account_name")
    )
    if top_level:
        return top_level
    for container_key in ("author", "user", "user_info", "profile", "account"):
        container = node.get(container_key)
        if not isinstance(container, dict):
            continue
        found = _pick_first_str(container, ("name", "nickname", "nick_name", "author_name", "title"))
        if found:
            return found
    return ""
def _extract_law_firm(node: Dict[str, Any]) -> str:
    """Look up a firm/organization name on *node*, then inside a nested enterprise dict."""
    firm_keys = (
        "law_firm",
        "firm",
        "lawFirm",
        "office",
        "org_name",
        "organization",
        "company",
        "enterprise",
    )
    top_level = _pick_first_str(node, firm_keys)
    if top_level:
        return top_level
    enterprise_node = node.get("enterprise")
    if isinstance(enterprise_node, dict):
        nested = _pick_first_str(enterprise_node, ("name", "company_name", "enterprise_name"))
        if nested:
            return nested
    return ""
def _extract_detail_url(node: Dict[str, Any], fallback_api_url: str) -> str:
    """Derive a browsable URL for *node*; fall back to the capturing API URL."""
    explicit = _pick_first_str(node, ("share_url", "url", "web_url", "detail_url", "jump_url"))
    if explicit:
        return explicit
    # No explicit URL: synthesize one from the video id, then the profile id.
    video_id = node.get("aweme_id") or node.get("item_id")
    if video_id:
        video_id_text = str(video_id).strip()
        if video_id_text:
            return f"https://www.douyin.com/video/{video_id_text}"
    profile_id = node.get("sec_uid")
    if profile_id:
        profile_id_text = str(profile_id).strip()
        if profile_id_text:
            return f"https://www.douyin.com/user/{profile_id_text}"
    return fallback_api_url
def _city_from_index(city_index: int, table_name: str, domain: str) -> Tuple[str, str]:
if city_index < 0:
return "", ""
try:
areas = _query_area_data(table_name, domain)
except Exception:
return "", ""
if city_index >= len(areas):
return "", ""
area = areas[city_index]
province = str(area.get("province") or "").strip()
city = str(area.get("city") or province).strip()
return province, city
def _existing_phones(domain: str, phones: List[str]) -> Set[str]:
if not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
with Db() as db:
cursor = db.db.cursor()
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cursor.execute(sql, [domain, *chunk])
for row in cursor.fetchall():
existing.add(str(row[0]))
cursor.close()
return existing
def _insert_lawyer_rows(rows: List[Dict[str, Any]], domain: str) -> Tuple[int, int]:
    """Insert candidate lawyer rows into the DB, deduplicating by phone number.

    Returns:
        (inserted, skipped) — ``skipped`` counts rows with no phone, rows that
        lost an in-batch same-phone dedup, and phones already present in the DB.
    """
    if not rows:
        return 0, 0

    def row_score(item: Dict[str, Any]) -> int:
        # Richness heuristic used to pick the best row among duplicates that
        # share a phone: a name weighs most, then firm, then url/location.
        score = 0
        if str(item.get("name") or "").strip():
            score += 5
        if str(item.get("law_firm") or "").strip():
            score += 3
        if str(item.get("url") or "").strip():
            score += 1
        if str(item.get("province") or "").strip() or str(item.get("city") or "").strip():
            score += 1
        phone_count_in_node = _parse_int(item.get("phone_count_in_node"), 1)
        if phone_count_in_node > 1:
            # Penalize rows extracted from nodes carrying several phones —
            # the phone-to-person attribution is less reliable there.
            score -= (phone_count_in_node - 1)
        return score

    # In-batch dedup: keep the highest-scoring row per phone.
    deduped_by_phone: Dict[str, Dict[str, Any]] = {}
    skipped = 0
    for row in rows:
        phone = str(row.get("phone") or "").strip()
        if not phone:
            skipped += 1
            continue
        old_row = deduped_by_phone.get(phone)
        if old_row is not None:
            # One of the two duplicates is dropped either way, so count one skip.
            if row_score(row) > row_score(old_row):
                deduped_by_phone[phone] = row
            skipped += 1
            continue
        deduped_by_phone[phone] = row
    # Phones already stored for this domain are skipped rather than re-inserted.
    existing = _existing_phones(domain, list(deduped_by_phone.keys()))
    inserted = 0
    with Db() as db:
        cursor = db.db.cursor()
        sql = (
            "INSERT INTO lawyer "
            "(name, phone, law_firm, province, city, url, domain, create_time, site_time, params) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )
        for phone, row in deduped_by_phone.items():
            if phone in existing:
                skipped += 1
                continue
            cursor.execute(
                sql,
                (
                    row.get("name") or "",
                    phone,
                    row.get("law_firm") or "",
                    row.get("province") or "",
                    row.get("city") or "",
                    row.get("url") or "",
                    domain,
                    _parse_int(row.get("create_time"), int(time.time())),
                    _parse_int(row.get("site_time"), int(time.time())),
                    row.get("params") or "{}",
                ),
            )
            inserted += 1
            # Remember the phone so it is never inserted twice within this call.
            existing.add(phone)
        # Single commit for the whole batch.
        db.db.commit()
        cursor.close()
    return inserted, skipped
def _extract_lawyer_rows_from_payload(
    payload: Dict[str, Any],
    area_table: str,
    area_domain: str,
    save_domain: str,
) -> List[Dict[str, Any]]:
    """Flatten a captured Douyin search payload into lawyer rows (one per phone).

    Assumes payload["data"]["user_list"] is a list of items each carrying a
    "user_info" dict — TODO confirm against the upstream capture format.
    Returns an empty list when that structure is absent.
    """
    now_ts = int(time.time())
    api_url = str(payload.get("url") or "").strip()
    # cityIndex is the client-side position into the area list; -1 means unknown.
    city_index = _parse_int(payload.get("cityIndex"), -1)
    city_province, city_name = _city_from_index(city_index, area_table, area_domain)
    rows: List[Dict[str, Any]] = []
    data = payload.get("data") if isinstance(payload, dict) else None
    user_list = data.get("user_list") if isinstance(data, dict) else None
    if not isinstance(user_list, list):
        return rows
    for user_item in user_list:
        if not isinstance(user_item, dict):
            continue
        user_info = user_item.get("user_info")
        if not isinstance(user_info, dict):
            continue
        phones = _extract_phones_from_user_info(user_info)
        if not phones:
            # Items with no extractable phone are not useful leads.
            continue
        name = str(user_info.get("nickname") or "").strip()
        law_firm = _extract_law_firm_from_user_info(user_info)
        sec_uid = str(user_info.get("sec_uid") or "").strip()
        url = f"https://www.douyin.com/user/{sec_uid}" if sec_uid else api_url
        province = city_province
        city = city_name or city_province
        # Provenance snapshot stored alongside each row (JSON in `params`).
        source_record = {
            "source": "douyin",
            "api_source": payload.get("source") or "",
            "api_url": api_url,
            "city_index": city_index,
            "captured_at": now_ts,
            "sec_uid": sec_uid,
            "user_info": {
                "uid": user_info.get("uid"),
                "nickname": user_info.get("nickname"),
                "signature": user_info.get("signature"),
                "unique_id": user_info.get("unique_id"),
                "versatile_display": user_info.get("versatile_display"),
            },
        }
        # Emit one output row per distinct phone found on this user.
        for phone in phones:
            rows.append(
                {
                    "name": name,
                    "phone": phone,
                    "law_firm": law_firm,
                    "province": province,
                    "city": city,
                    "url": url,
                    "domain": save_domain,
                    "create_time": now_ts,
                    "site_time": _parse_int(payload.get("ts"), now_ts),
                    "phone_count_in_node": len(phones),
                    "params": json.dumps(source_record, ensure_ascii=False),
                }
            )
    return rows
class AreaSyncHandler(BaseHTTPRequestHandler):
    """HTTP handler exposing the area lookup and Douyin index-capture endpoints.

    Routes:
        GET  /health              — liveness probe.
        GET  /api/layer/get_area  — area rows for a table/domain.
        POST /api/layer/index     — archive (and optionally import) a raw payload.

    All JSON responses carry permissive CORS headers.
    """

    server_version = "AreaSyncService/2.0"

    def _write_json(self, status: int, payload: Any) -> None:
        # Serialize *payload* and send it with CORS headers and Content-Length.
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()
        self.wfile.write(body)

    def _read_json_body(self) -> Any:
        # Best-effort JSON body read; any failure yields an empty dict.
        length = _parse_int(self.headers.get("Content-Length"), 0)
        if length <= 0:
            return {}
        raw = self.rfile.read(length)
        if not raw:
            return {}
        try:
            return json.loads(raw.decode("utf-8"))
        except Exception:
            return {}

    def do_OPTIONS(self) -> None:
        # CORS preflight: no body, just the allow headers.
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self) -> None:
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)
        if parsed.path == "/health":
            self._write_json(200, {"ok": True, "service": "layer-service"})
            return
        if parsed.path == "/api/layer/get_area":
            # Query params may override the env-configured table/domain.
            table_name = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            domain = _first_param(params, "domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            with_meta = _parse_bool(_first_param(params, "meta", "0"), False)
            try:
                rows = _query_area_data(table_name, domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return
            if with_meta:
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "count": len(rows),
                        "table": table_name,
                        "domain": domain,
                        "data": rows,
                    },
                )
            else:
                # meta=0 (default): respond with the bare JSON array of rows.
                self._write_json(200, rows)
            return
        self._write_json(404, {"ok": False, "error": "not found"})

    def do_POST(self) -> None:
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)
        if parsed.path == "/api/layer/index":
            body = self._read_json_body()
            if not isinstance(body, dict) or not body:
                self._write_json(400, {"ok": False, "error": "invalid json body"})
                return
            area_table = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            area_domain = _first_param(params, "area_domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            save_domain = _first_param(params, "save_domain", DOUYIN_DOMAIN).strip() or DOUYIN_DOMAIN
            # save_only: the query param wins; the DOUYIN_SAVE_ONLY env var is the fallback.
            save_only_default = _parse_bool(DOUYIN_SAVE_ONLY_ENV, True)
            save_only = _parse_bool(_first_param(params, "save_only", DOUYIN_SAVE_ONLY_ENV), save_only_default)
            try:
                # Always archive the raw payload first, regardless of save_only.
                saved_file = _save_raw_index_payload(body, params, self.client_address[0] if self.client_address else "")
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": f"save raw payload failed: {exc}"})
                return
            if save_only:
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "message": "saved_only",
                        "save_only": True,
                        "saved_file": saved_file,
                    },
                )
                return
            # save_only disabled: additionally extract rows and insert into the DB.
            try:
                extracted = _extract_lawyer_rows_from_payload(body, area_table, area_domain, save_domain)
                inserted, skipped = _insert_lawyer_rows(extracted, save_domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return
            self._write_json(
                200,
                {
                    "ok": True,
                    "message": "received",
                    "save_domain": save_domain,
                    "save_only": False,
                    "saved_file": saved_file,
                    "extracted": len(extracted),
                    "inserted": inserted,
                    "skipped": skipped,
                },
            )
            return
        self._write_json(404, {"ok": False, "error": "not found"})
def run() -> None:
    """Start the threaded HTTP server and block forever serving requests."""
    bind_address = (SERVICE_HOST, SERVICE_PORT)
    server = ThreadingHTTPServer(bind_address, AreaSyncHandler)
    print(f"[layer-service] running on http://{SERVICE_HOST}:{SERVICE_PORT}")
    print(f"[layer-service] get_area -> table/domain: {AREA_TABLE}/{AREA_DOMAIN}")
    print(f"[layer-service] index -> save domain: {DOUYIN_DOMAIN}")
    server.serve_forever()
if __name__ == "__main__":
    # Script entry point: run the HTTP service until interrupted.
    run()