"""HTTP service exposing area lookup data and ingesting Douyin index payloads.

Endpoints:
    GET  /health               -- liveness probe
    GET  /api/layer/get_area   -- area rows for a (table, domain) pair
    POST /api/layer/index      -- raw Douyin search payload; always archived to
                                  a daily JSONL file, optionally parsed into
                                  `lawyer` table rows (phone / firm extraction)
"""

import json
import os
import re
import sys
import threading
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Dict, Iterable, List, Set, Tuple
from urllib.parse import parse_qs, urlparse

# Make the project root importable so the local Db module resolves.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

from Db import Db  # noqa: E402  (depends on the sys.path tweak above)

# Configuration, overridable via environment variables.
AREA_TABLE = os.getenv("AREA_TARGET_TABLE", "area_new")
AREA_DOMAIN = os.getenv("AREA_DOMAIN", "maxlaw")
DOUYIN_DOMAIN = os.getenv("DOUYIN_DOMAIN", "抖音")
DOUYIN_RAW_DIR = os.getenv("DOUYIN_RAW_DIR", os.path.join(project_root, "data", "douyin_raw"))
DOUYIN_SAVE_ONLY_ENV = os.getenv("DOUYIN_SAVE_ONLY", "1")
SERVICE_HOST = os.getenv("AREA_SERVICE_HOST", "0.0.0.0")
SERVICE_PORT = int(os.getenv("AREA_SERVICE_PORT", "9002"))

# Mainland-China mobile number (11 digits, 1[3-9]...), optional +86 prefix.
PHONE_REGEX = re.compile(r"(?:\+?86[-\s]?)?(1[3-9]\d{9})")
# "WeChat: xxx"-style markers; group 1 is the claimed WeChat id.
WX_CONTEXT_REGEX = re.compile(r"(?i)(?:微信|微.?信|wx|vx|weixin|v信|v号|v)\s*[::/\-\s]\s*([a-zA-Z0-9._-]{3,40})")
# A law-firm name ending in 律师事务所 ("law firm").
LAW_FIRM_REGEX = re.compile(r"([\u4e00-\u9fa5A-Za-z·]{2,40}律师事务所)")

# Serializes appends to the raw JSONL archive across handler threads.
RAW_WRITE_LOCK = threading.Lock()

# Defensive cap on Content-Length in _read_json_body: the header is
# attacker-controlled and was previously read into memory unbounded.
MAX_JSON_BODY_BYTES = 16 * 1024 * 1024


def _is_safe_table_name(table_name: str) -> bool:
    """Return True if *table_name* is safe to interpolate into SQL ([A-Za-z0-9_] only)."""
    return bool(re.fullmatch(r"[A-Za-z0-9_]+", table_name or ""))


def _parse_int(value: Any, default: int = 0) -> int:
    """Best-effort int conversion; return *default* on any failure."""
    try:
        return int(str(value).strip())
    except Exception:
        return default


def _parse_bool(value: Any, default: bool = False) -> bool:
    """Parse common truthy/falsy strings; return *default* when unrecognized or None."""
    if value is None:
        return default
    text = str(value).strip().lower()
    if text in {"1", "true", "yes", "y", "on"}:
        return True
    if text in {"0", "false", "no", "n", "off"}:
        return False
    return default


def _first_param(params: Dict[str, List[str]], key: str, default: str = "") -> str:
    """Return the first value of a parse_qs-style parameter, or *default*."""
    values = params.get(key) or []
    if not values:
        return default
    return values[0]


def _append_jsonl(file_path: str, payload: Dict[str, Any]) -> None:
    """Append *payload* as one JSON line to *file_path*, creating directories as needed.

    Serialization happens outside the lock; only the file append is serialized.
    """
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
    line = json.dumps(payload, ensure_ascii=False)
    with RAW_WRITE_LOCK:
        with open(file_path, "a", encoding="utf-8") as out:
            out.write(line)
            out.write("\n")


def _save_raw_index_payload(payload: Dict[str, Any], query: Dict[str, List[str]], client_ip: str) -> str:
    """Archive a raw index payload (plus request metadata) to a per-day JSONL file.

    Returns the path of the file written to.
    """
    now_ts = int(time.time())
    day = time.strftime("%Y%m%d", time.localtime(now_ts))
    file_path = os.path.join(DOUYIN_RAW_DIR, f"douyin_index_{day}.jsonl")
    wrapped = {
        "received_at": now_ts,
        "client_ip": client_ip,
        "query": query,
        "payload": payload,
    }
    _append_jsonl(file_path, wrapped)
    return file_path


def _query_area_data(table_name: str, domain: str) -> List[Dict[str, Any]]:
    """Fetch all area rows for *domain* from *table_name*, ordered by id.

    Raises ValueError when the table name fails the whitelist check (the name
    is interpolated into SQL, so it must be validated first).
    """
    if not _is_safe_table_name(table_name):
        raise ValueError("非法表名")
    with Db() as db:
        cursor = db.db.cursor()
        try:
            # Table name is whitelist-validated above; domain is parameterized.
            sql = (
                f"SELECT province, city, name, pid, pinyin, code, domain, level, create_time "
                f"FROM `{table_name}` WHERE domain=%s ORDER BY id ASC"
            )
            cursor.execute(sql, (domain,))
            rows = cursor.fetchall()
        finally:
            # Close even if execute/fetch raises (previously leaked on error).
            cursor.close()
    result: List[Dict[str, Any]] = []
    for row in rows:
        result.append(
            {
                "province": row[0] or "",
                "city": row[1] or "",
                "name": row[2] or "",
                "pid": row[3] if row[3] is not None else 0,
                "pinyin": row[4] or "",
                "code": row[5] or "",
                "domain": row[6] or "",
                "level": row[7] if row[7] is not None else 0,
                "create_time": row[8] if row[8] is not None else 0,
            }
        )
    return result


def _iter_dict_nodes(value: Any) -> Iterable[Dict[str, Any]]:
    """Yield every dict reachable from *value* through nested dicts/lists.

    NOTE(review): currently unused in this module; kept for compatibility with
    possible external callers.
    """
    stack: List[Any] = [value]
    while stack:
        current = stack.pop()
        if isinstance(current, dict):
            yield current
            stack.extend(current.values())
        elif isinstance(current, list):
            stack.extend(current)


def _extract_phones_from_text(text: str) -> List[str]:
    """Return unique phone numbers found in *text*, in first-seen order."""
    phones: List[str] = []
    seen: Set[str] = set()
    for match in PHONE_REGEX.finditer(text or ""):
        phone = match.group(1)
        if not phone or phone in seen:
            continue
        seen.add(phone)
        phones.append(phone)
    return phones


def _extract_phones_from_user_info(user_info: Dict[str, Any]) -> List[str]:
    """Extract phone numbers from a Douyin user_info dict, sorted.

    Looks at signature (bio), unique_id (Douyin id) and versatile_display.
    """
    signature = str(user_info.get("signature") or "")
    unique_id = str(user_info.get("unique_id") or "")
    versatile = str(user_info.get("versatile_display") or "")
    # 1) Prefer phone numbers matched directly in the bio/signature.
    phones = set(_extract_phones_from_text(signature))
    if phones:
        return sorted(phones)
    # 2) Mine WeChat-style markers, then fall back to the Douyin-id fields.
    for text in (signature, unique_id, versatile):
        for match in WX_CONTEXT_REGEX.finditer(text):
            wx_value = match.group(1) or ""
            for phone in _extract_phones_from_text(wx_value):
                phones.add(phone)
    for text in (unique_id, versatile):
        for phone in _extract_phones_from_text(text):
            phones.add(phone)
    return sorted(phones)


def _extract_law_firm_from_user_info(user_info: Dict[str, Any]) -> str:
    """Return the first law-firm name found in signature, verify reason or cert info."""
    candidates: List[str] = []
    signature = str(user_info.get("signature") or "")
    if signature:
        candidates.append(signature)
    verify_reason = str(user_info.get("enterprise_verify_reason") or "")
    if verify_reason:
        candidates.append(verify_reason)
    cert_text = ""
    account_cert_info = user_info.get("account_cert_info")
    if isinstance(account_cert_info, str) and account_cert_info.strip():
        # account_cert_info is usually a JSON string with a label_text field;
        # fall back to the raw string when it is not valid JSON.
        try:
            cert_obj = json.loads(account_cert_info)
            if isinstance(cert_obj, dict):
                cert_text = str(cert_obj.get("label_text") or "").strip()
        except Exception:
            cert_text = account_cert_info.strip()
    if cert_text:
        candidates.append(cert_text)
    for text in candidates:
        match = LAW_FIRM_REGEX.search(text)
        if match:
            return match.group(1)
    return ""


def _pick_first_str(node: Dict[str, Any], keys: Tuple[str, ...]) -> str:
    """Return the first non-empty string value among *keys* in *node*, stripped."""
    for key in keys:
        value = node.get(key)
        if isinstance(value, str):
            text = value.strip()
            if text:
                return text
    return ""


def _extract_name(node: Dict[str, Any]) -> str:
    """Best-effort display name from a node, checking direct then nested keys."""
    direct = _pick_first_str(node, ("name", "nickname", "nick_name", "author_name", "title", "account_name"))
    if direct:
        return direct
    for nested_key in ("author", "user", "user_info", "profile", "account"):
        nested = node.get(nested_key)
        if isinstance(nested, dict):
            nested_name = _pick_first_str(nested, ("name", "nickname", "nick_name", "author_name", "title"))
            if nested_name:
                return nested_name
    return ""


def _extract_law_firm(node: Dict[str, Any]) -> str:
    """Best-effort law-firm name from a node's direct or enterprise fields."""
    direct = _pick_first_str(
        node,
        (
            "law_firm",
            "firm",
            "lawFirm",
            "office",
            "org_name",
            "organization",
            "company",
            "enterprise",
        ),
    )
    if direct:
        return direct
    enterprise = node.get("enterprise")
    if isinstance(enterprise, dict):
        company_name = _pick_first_str(enterprise, ("name", "company_name", "enterprise_name"))
        if company_name:
            return company_name
    return ""


def _extract_detail_url(node: Dict[str, Any], fallback_api_url: str) -> str:
    """Pick a detail URL: explicit URL fields, then video/user ids, then fallback."""
    url = _pick_first_str(node, ("share_url", "url", "web_url", "detail_url", "jump_url"))
    if url:
        return url
    aweme_id = node.get("aweme_id") or node.get("item_id")
    if aweme_id:
        aid = str(aweme_id).strip()
        if aid:
            return f"https://www.douyin.com/video/{aid}"
    sec_uid = node.get("sec_uid")
    if sec_uid:
        sec_uid_text = str(sec_uid).strip()
        if sec_uid_text:
            return f"https://www.douyin.com/user/{sec_uid_text}"
    return fallback_api_url


def _city_from_index(city_index: int, table_name: str, domain: str) -> Tuple[str, str]:
    """Map a positional city index onto (province, city) from the area table.

    Returns ("", "") for negative/out-of-range indexes or on any DB failure
    (best-effort: geo info is optional for ingestion).
    """
    if city_index < 0:
        return "", ""
    try:
        areas = _query_area_data(table_name, domain)
    except Exception:
        return "", ""
    if city_index >= len(areas):
        return "", ""
    area = areas[city_index]
    province = str(area.get("province") or "").strip()
    city = str(area.get("city") or province).strip()
    return province, city


def _existing_phones(domain: str, phones: List[str]) -> Set[str]:
    """Return the subset of *phones* already present in `lawyer` for *domain*.

    Queries in chunks of 500 to keep the IN() placeholder list bounded.
    """
    if not phones:
        return set()
    deduped = sorted({p for p in phones if p})
    if not deduped:
        return set()
    existing: Set[str] = set()
    with Db() as db:
        cursor = db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cursor.execute(sql, [domain, *chunk])
                for row in cursor.fetchall():
                    existing.add(str(row[0]))
        finally:
            # Close even on query failure (previously leaked on error).
            cursor.close()
    return existing


def _insert_lawyer_rows(rows: List[Dict[str, Any]], domain: str) -> Tuple[int, int]:
    """Insert extracted lawyer rows, deduplicating by phone.

    Within the batch, the highest-scoring row per phone wins; phones already in
    the DB are skipped. Returns (inserted, skipped).
    """
    if not rows:
        return 0, 0

    def row_score(item: Dict[str, Any]) -> int:
        # Richer rows (name, firm, url, geo) score higher; rows coming from a
        # node that yielded many phones are penalized as less reliable.
        score = 0
        if str(item.get("name") or "").strip():
            score += 5
        if str(item.get("law_firm") or "").strip():
            score += 3
        if str(item.get("url") or "").strip():
            score += 1
        if str(item.get("province") or "").strip() or str(item.get("city") or "").strip():
            score += 1
        phone_count_in_node = _parse_int(item.get("phone_count_in_node"), 1)
        if phone_count_in_node > 1:
            score -= (phone_count_in_node - 1)
        return score

    deduped_by_phone: Dict[str, Dict[str, Any]] = {}
    skipped = 0
    for row in rows:
        phone = str(row.get("phone") or "").strip()
        if not phone:
            skipped += 1
            continue
        old_row = deduped_by_phone.get(phone)
        if old_row is not None:
            if row_score(row) > row_score(old_row):
                deduped_by_phone[phone] = row
            skipped += 1
            continue
        deduped_by_phone[phone] = row

    existing = _existing_phones(domain, list(deduped_by_phone.keys()))
    inserted = 0
    with Db() as db:
        cursor = db.db.cursor()
        try:
            sql = (
                "INSERT INTO lawyer "
                "(name, phone, law_firm, province, city, url, domain, create_time, site_time, params) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            )
            for phone, row in deduped_by_phone.items():
                if phone in existing:
                    skipped += 1
                    continue
                cursor.execute(
                    sql,
                    (
                        row.get("name") or "",
                        phone,
                        row.get("law_firm") or "",
                        row.get("province") or "",
                        row.get("city") or "",
                        row.get("url") or "",
                        domain,
                        _parse_int(row.get("create_time"), int(time.time())),
                        _parse_int(row.get("site_time"), int(time.time())),
                        row.get("params") or "{}",
                    ),
                )
                inserted += 1
                existing.add(phone)
            # Commit only after the whole batch succeeds (as before).
            db.db.commit()
        finally:
            # Close even on insert failure (previously leaked on error).
            cursor.close()
    return inserted, skipped


def _extract_lawyer_rows_from_payload(
    payload: Dict[str, Any],
    area_table: str,
    area_domain: str,
    save_domain: str,
) -> List[Dict[str, Any]]:
    """Turn a raw Douyin index payload into lawyer rows ready for insertion.

    One output row per (user, phone) pair; each row carries a JSON `params`
    field recording its provenance.
    """
    now_ts = int(time.time())
    api_url = str(payload.get("url") or "").strip()
    city_index = _parse_int(payload.get("cityIndex"), -1)
    city_province, city_name = _city_from_index(city_index, area_table, area_domain)
    rows: List[Dict[str, Any]] = []
    data = payload.get("data") if isinstance(payload, dict) else None
    user_list = data.get("user_list") if isinstance(data, dict) else None
    if not isinstance(user_list, list):
        return rows
    for user_item in user_list:
        if not isinstance(user_item, dict):
            continue
        user_info = user_item.get("user_info")
        if not isinstance(user_info, dict):
            continue
        phones = _extract_phones_from_user_info(user_info)
        if not phones:
            continue
        name = str(user_info.get("nickname") or "").strip()
        law_firm = _extract_law_firm_from_user_info(user_info)
        sec_uid = str(user_info.get("sec_uid") or "").strip()
        url = f"https://www.douyin.com/user/{sec_uid}" if sec_uid else api_url
        province = city_province
        city = city_name or city_province
        source_record = {
            "source": "douyin",
            "api_source": payload.get("source") or "",
            "api_url": api_url,
            "city_index": city_index,
            "captured_at": now_ts,
            "sec_uid": sec_uid,
            "user_info": {
                "uid": user_info.get("uid"),
                "nickname": user_info.get("nickname"),
                "signature": user_info.get("signature"),
                "unique_id": user_info.get("unique_id"),
                "versatile_display": user_info.get("versatile_display"),
            },
        }
        for phone in phones:
            rows.append(
                {
                    "name": name,
                    "phone": phone,
                    "law_firm": law_firm,
                    "province": province,
                    "city": city,
                    "url": url,
                    "domain": save_domain,
                    "create_time": now_ts,
                    "site_time": _parse_int(payload.get("ts"), now_ts),
                    "phone_count_in_node": len(phones),
                    "params": json.dumps(source_record, ensure_ascii=False),
                }
            )
    return rows


class AreaSyncHandler(BaseHTTPRequestHandler):
    """Request handler for the area/Douyin-ingestion service (CORS-enabled)."""

    server_version = "AreaSyncService/2.0"

    def _write_json(self, status: int, payload: Any) -> None:
        """Serialize *payload* as JSON and send it with permissive CORS headers."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()
        self.wfile.write(body)

    def _read_json_body(self) -> Any:
        """Read and parse the JSON request body; return {} on any problem.

        Rejects bodies whose declared Content-Length exceeds
        MAX_JSON_BODY_BYTES (the header is client-controlled).
        """
        length = _parse_int(self.headers.get("Content-Length"), 0)
        if length <= 0 or length > MAX_JSON_BODY_BYTES:
            return {}
        raw = self.rfile.read(length)
        if not raw:
            return {}
        try:
            return json.loads(raw.decode("utf-8"))
        except Exception:
            return {}

    def do_OPTIONS(self) -> None:
        """Answer CORS preflight requests."""
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
        self.send_header("Access-Control-Allow-Headers", "Content-Type")
        self.end_headers()

    def do_GET(self) -> None:
        """Route GET requests: /health and /api/layer/get_area."""
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)
        if parsed.path == "/health":
            self._write_json(200, {"ok": True, "service": "layer-service"})
            return
        if parsed.path == "/api/layer/get_area":
            table_name = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            domain = _first_param(params, "domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            with_meta = _parse_bool(_first_param(params, "meta", "0"), False)
            try:
                rows = _query_area_data(table_name, domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return
            if with_meta:
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "count": len(rows),
                        "table": table_name,
                        "domain": domain,
                        "data": rows,
                    },
                )
            else:
                # Legacy callers expect the bare row list.
                self._write_json(200, rows)
            return
        self._write_json(404, {"ok": False, "error": "not found"})

    def do_POST(self) -> None:
        """Route POST requests: /api/layer/index (archive + optional ingestion)."""
        parsed = urlparse(self.path)
        params = parse_qs(parsed.query)
        if parsed.path == "/api/layer/index":
            body = self._read_json_body()
            if not isinstance(body, dict) or not body:
                self._write_json(400, {"ok": False, "error": "invalid json body"})
                return
            area_table = _first_param(params, "table", AREA_TABLE).strip() or AREA_TABLE
            area_domain = _first_param(params, "area_domain", AREA_DOMAIN).strip() or AREA_DOMAIN
            save_domain = _first_param(params, "save_domain", DOUYIN_DOMAIN).strip() or DOUYIN_DOMAIN
            # Query param overrides the env default for save_only.
            save_only_default = _parse_bool(DOUYIN_SAVE_ONLY_ENV, True)
            save_only = _parse_bool(_first_param(params, "save_only", DOUYIN_SAVE_ONLY_ENV), save_only_default)
            # Always archive the raw payload first, even in ingest mode.
            try:
                saved_file = _save_raw_index_payload(
                    body, params, self.client_address[0] if self.client_address else ""
                )
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": f"save raw payload failed: {exc}"})
                return
            if save_only:
                self._write_json(
                    200,
                    {
                        "ok": True,
                        "message": "saved_only",
                        "save_only": True,
                        "saved_file": saved_file,
                    },
                )
                return
            try:
                extracted = _extract_lawyer_rows_from_payload(body, area_table, area_domain, save_domain)
                inserted, skipped = _insert_lawyer_rows(extracted, save_domain)
            except Exception as exc:
                self._write_json(500, {"ok": False, "error": str(exc)})
                return
            self._write_json(
                200,
                {
                    "ok": True,
                    "message": "received",
                    "save_domain": save_domain,
                    "save_only": False,
                    "saved_file": saved_file,
                    "extracted": len(extracted),
                    "inserted": inserted,
                    "skipped": skipped,
                },
            )
            return
        self._write_json(404, {"ok": False, "error": "not found"})


def run() -> None:
    """Start the threaded HTTP server and serve forever."""
    server = ThreadingHTTPServer((SERVICE_HOST, SERVICE_PORT), AreaSyncHandler)
    print(f"[layer-service] running on http://{SERVICE_HOST}:{SERVICE_PORT}")
    print(f"[layer-service] get_area -> table/domain: {AREA_TABLE}/{AREA_DOMAIN}")
    print(f"[layer-service] index -> save domain: {DOUYIN_DOMAIN}")
    server.serve_forever()


if __name__ == "__main__":
    run()