feat: add shared progress API and resume/skip support for douyin batch

This commit is contained in:
hello-dd-code
2026-03-07 01:06:40 +08:00
parent 86cf933913
commit e10437cd90
3 changed files with 594 additions and 19 deletions
+19 -1
View File
@@ -29,12 +29,14 @@ python3 -m venv .venv
- 替代原 `nas.nepiedg.site:9002` 的核心接口 - 替代原 `nas.nepiedg.site:9002` 的核心接口
- `GET /api/layer/get_area`:从数据库 `area_new` 读取地区列表并返回给 `js/douyin.js` - `GET /api/layer/get_area`:从数据库 `area_new` 读取地区列表并返回给 `js/douyin.js`
- `POST /api/layer/index`:接收脚本回传搜索数据,先保存原始 JSON 到本地,再按参数决定是否入库 - `POST /api/layer/index`:接收脚本回传搜索数据,先保存原始 JSON 到本地,再按参数决定是否入库
- `GET/POST /api/layer/progress`:多设备共享采集断点(自动建表 `layer_progress`)
`/api/layer/index` 当前入库规则(基于 `payload.data.user_list[].user_info`): `/api/layer/index` 当前入库规则(基于 `payload.data.user_list[].user_info`):
- 主要从 `signature`(简介)里正则提取手机号 - 主要从 `signature`(简介)里正则提取手机号
- 若简介未命中,再从微信相关标记(`微信/wx/vx/v`)和 `unique_id/versatile_display` 提取手机号 - 若简介未命中,再从微信相关标记(`微信/wx/vx/v`)和 `unique_id/versatile_display` 提取手机号
- `url` 固定写为 `https://www.douyin.com/user/{sec_uid}``sec_uid` 为空时回退接口 URL - 必须命中关键词(默认:`律师,律所`)才允许入库,可通过 `DOUYIN_LAWYER_KEYWORDS` 调整
- `url` 固定写为 `https://www.douyin.com/user/{sec_uid}`(`sec_uid` 为空则跳过不入库)
启动: 启动:
@@ -53,6 +55,9 @@ AREA_DOMAIN=maxlaw
DOUYIN_DOMAIN=抖音 DOUYIN_DOMAIN=抖音
DOUYIN_RAW_DIR=/www/wwwroot/lawyers/data/douyin_raw DOUYIN_RAW_DIR=/www/wwwroot/lawyers/data/douyin_raw
DOUYIN_SAVE_ONLY=1 DOUYIN_SAVE_ONLY=1
DOUYIN_LAWYER_KEYWORDS=律师,律所
LAYER_PROGRESS_TABLE=layer_progress
LAYER_PROGRESS_DEFAULT_KEY=douyin_batch_default
``` ```
接口示例: 接口示例:
@@ -84,6 +89,19 @@ curl -X POST 'http://127.0.0.1:9002/api/layer/index?save_only=1' \
# 原始数据落盘目录(按天分文件) # 原始数据落盘目录(按天分文件)
# /www/wwwroot/lawyers/data/douyin_raw/douyin_index_YYYYMMDD.jsonl # /www/wwwroot/lawyers/data/douyin_raw/douyin_index_YYYYMMDD.jsonl
# 读取共享断点(多设备)
curl 'http://127.0.0.1:9002/api/layer/progress?server=1&progress_key=douyin_batch_default'
# 更新共享断点
curl -X POST 'http://127.0.0.1:9002/api/layer/progress?server=1' \
-H 'Content-Type: application/json' \
-d '{"progress_key":"douyin_batch_default","device_id":"device-a","next_city_index":120,"area_signature":"xxxx","area_total":551,"current_city":"北京","reason":"city_done","status":"running"}'
# 清空共享断点
curl -X POST 'http://127.0.0.1:9002/api/layer/progress?server=1' \
-H 'Content-Type: application/json' \
-d '{"action":"clear","progress_key":"douyin_batch_default"}'
``` ```
如果 9002 端口已有旧进程占用,可先执行: 如果 9002 端口已有旧进程占用,可先执行:
+310 -13
View File
@@ -1,7 +1,7 @@
// ==UserScript== // ==UserScript==
// @name Douyin Batch City Search + AutoScroll + Capture // @name Douyin Batch City Search + AutoScroll + Capture
// @namespace http://tampermonkey.net/ // @namespace http://tampermonkey.net/
// @version 1.0 // @version 1.1
// @description 从 Python 服务获取地区列表,按 city + "律师" 搜索并自动下滑,拦截 /aweme/v1/web/discover/search/ 返回并转发到入库接口。 // @description 从 Python 服务获取地区列表,按 city + "律师" 搜索并自动下滑,拦截 /aweme/v1/web/discover/search/ 返回并转发到入库接口。
// @author You // @author You
// @match https://www.douyin.com/* // @match https://www.douyin.com/*
@@ -43,6 +43,13 @@
const WAIT_AFTER_SEARCH_MS = 1000; const WAIT_AFTER_SEARCH_MS = 1000;
const DELAY_BETWEEN_CITIES_MS = 1500; const DELAY_BETWEEN_CITIES_MS = 1500;
// Checkpoint / resume configuration
// localStorage key under which the local progress record is stored.
const PROGRESS_STORAGE_KEY = 'dm_batch_progress_v1';
// localStorage key under which this browser's generated device id is stored.
const DEVICE_ID_STORAGE_KEY = 'dm_batch_device_id_v1';
// When false, the remote progress helpers return early and nothing is sent to or read from the server.
const PROGRESS_SYNC_ENABLED = true;
// Value sent as `progress_key` in every progress request.
const PROGRESS_KEY = 'douyin_batch_default';
// Progress endpoint; already carries a query string, so extra parameters are appended with `&`.
const PROGRESS_API = `${API_BASE}/api/layer/progress?server=1`;
// 可选:如果希望只发送包含手机号的条目,可在此启用并调整正则 // 可选:如果希望只发送包含手机号的条目,可在此启用并调整正则
const ONLY_SEND_IF_HAS_PHONE = false; const ONLY_SEND_IF_HAS_PHONE = false;
const PHONE_REGEX = /(?:\+?86)?1[3-9]\d{9}/g; const PHONE_REGEX = /(?:\+?86)?1[3-9]\d{9}/g;
@@ -50,14 +57,20 @@
/********************* 运行时状态 *********************/ /********************* 运行时状态 *********************/
let areaList = []; let areaList = [];
let stopFlag = false; // 由 UI 控制,true 表示停止整个任务 let stopFlag = false; // 由 UI 控制,true 表示停止整个任务
let skipCurrentCityFlag = false; // 由 UI 控制,true 表示跳过当前城市
let currentCityIndex = -1; let currentCityIndex = -1;
let currentAreaSignature = '';
let isLoopRunning = false;
let inputEl = null; let inputEl = null;
let btnEl = null; let btnEl = null;
const DEVICE_ID = getOrCreateDeviceId();
// 节流/去重发送 // 节流/去重发送
let lastSentHash = null; let lastSentHash = null;
let lastSentAt = 0; let lastSentAt = 0;
const SEND_MIN_INTERVAL_MS = 800; const SEND_MIN_INTERVAL_MS = 800;
let progressSyncInFlight = false;
let progressSyncPendingPayload = null;
/********************* 工具函数 *********************/ /********************* 工具函数 *********************/
function log(...args) { console.log('[DouyinBatch] ', ...args); } function log(...args) { console.log('[DouyinBatch] ', ...args); }
@@ -72,6 +85,106 @@
return h.toString(16); return h.toString(16);
} }
/**
 * Pause helper for async flows.
 *
 * @param {number} ms - Delay handed unchanged to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
/**
 * Return the device id stored in localStorage, creating and storing one on first use.
 *
 * A new id comes from `window.crypto.randomUUID()` when that function exists,
 * otherwise from a `dm-<timestamp>-<random hex>` string. If anything throws
 * (for example storage access), a non-persisted fallback id is returned.
 *
 * @returns {string} The device identifier.
 */
function getOrCreateDeviceId() {
    const makeFallbackId = () => `dm-${Date.now()}-${Math.random().toString(16).slice(2, 10)}`;
    try {
        const stored = localStorage.getItem(DEVICE_ID_STORAGE_KEY);
        if (stored) {
            return stored;
        }
        const canUseUuid = window.crypto && typeof window.crypto.randomUUID === 'function';
        const fresh = canUseUuid ? window.crypto.randomUUID() : makeFallbackId();
        localStorage.setItem(DEVICE_ID_STORAGE_KEY, fresh);
        return fresh;
    } catch (_) {
        // Storage or crypto access failed: hand out an id for this call only.
        return makeFallbackId();
    }
}
/**
 * Pick the display name of an area row.
 *
 * @param {object} row - Area record; `city`, `province` and `name` are tried in that order.
 * @returns {string} The first truthy field as a trimmed string, or '' when
 *     `row` is not an object or none of the fields is set.
 */
function getAreaRowName(row) {
    const isObject = Boolean(row) && typeof row === 'object';
    if (!isObject) {
        return '';
    }
    const label = row.city || row.province || row.name || '';
    return String(label).trim();
}
/**
 * Build a short fingerprint of the area list so saved progress can be matched
 * against the list it was recorded for.
 *
 * @param {Array<object>} list - Area rows.
 * @returns {string} 'empty' for a missing or empty list, 'unknown' if hashing
 *     throws, otherwise `hashString` of "<row count>|<name>|<name>..." where
 *     rows without a name are left out of the name part.
 */
function buildAreaSignature(list) {
    try {
        const hasRows = Array.isArray(list) && list.length > 0;
        if (!hasRows) {
            return 'empty';
        }
        const labels = [];
        for (const row of list) {
            const label = getAreaRowName(row);
            if (label) {
                labels.push(label);
            }
        }
        return hashString(`${list.length}|${labels.join('|')}`);
    } catch (e) {
        return 'unknown';
    }
}
/**
 * Load the locally saved progress record.
 *
 * @returns {object|null} The parsed record, or null when nothing is stored,
 *     the stored text is not valid JSON, the parsed value is not an object,
 *     or storage access throws.
 */
function readProgress() {
    let parsed;
    try {
        const raw = localStorage.getItem(PROGRESS_STORAGE_KEY);
        if (!raw) {
            return null;
        }
        parsed = JSON.parse(raw);
    } catch (_) {
        return null;
    }
    const isObject = Boolean(parsed) && typeof parsed === 'object';
    return isObject ? parsed : null;
}
/**
 * Assemble the progress record that is saved locally and posted to the server.
 *
 * @param {number} nextCityIndex - Index to resume from; non-finite values become 0,
 *     other values are floored and clamped to be non-negative.
 * @param {string} [reason=''] - Free-form tag describing why progress is being saved.
 * @returns {object} Payload with progress key, device id, index, area signature,
 *     list size, city name, reason, status and page location.
 */
function buildProgressPayload(nextCityIndex, reason = '') {
    let safeIndex = 0;
    if (Number.isFinite(nextCityIndex)) {
        safeIndex = Math.max(0, Math.floor(nextCityIndex));
    }

    // Prefer the row at the resume index; past the end of the list fall back
    // to the city currently being processed.
    const fallbackIndex = Math.max(0, currentCityIndex);
    const areaRow = areaList[safeIndex] || areaList[fallbackIndex] || {};
    const areaTotal = Array.isArray(areaList) ? areaList.length : 0;
    const status = stopFlag ? 'paused' : 'running';

    return {
        progress_key: PROGRESS_KEY,
        device_id: DEVICE_ID,
        next_city_index: safeIndex,
        area_signature: currentAreaSignature || '',
        area_total: areaTotal,
        current_city: getAreaRowName(areaRow),
        reason,
        status,
        extra: {
            path: location.pathname || '',
            href: location.href || '',
        },
    };
}
/**
 * Save the resume point locally and queue it for the shared progress API.
 *
 * The local write and the remote sync are independent: if localStorage
 * rejects the write (quota exceeded, storage blocked), the payload is still
 * queued for the server so other devices keep seeing up-to-date progress.
 *
 * @param {number} nextCityIndex - Index the batch should resume from.
 * @param {string} [reason=''] - Free-form tag describing why progress is being saved.
 * @returns {void} Never throws; failures are reported through `err`.
 */
function persistProgress(nextCityIndex, reason = '') {
    try {
        const payload = buildProgressPayload(nextCityIndex, reason);
        try {
            localStorage.setItem(PROGRESS_STORAGE_KEY, JSON.stringify({
                nextCityIndex: payload.next_city_index,
                areaSignature: payload.area_signature,
                reason: payload.reason,
                updatedAt: Date.now(),
                progressKey: payload.progress_key,
                deviceId: payload.device_id,
            }));
        } catch (storageError) {
            // A failed local write must not block the remote checkpoint.
            err('保存进度失败', storageError);
        }
        enqueueRemoteProgressSync(payload);
    } catch (e) {
        err('保存进度失败', e);
    }
}
/**
 * Work out the resume index from the locally saved progress record.
 *
 * @param {string} areaSignature - Signature of the area list now in use.
 * @param {number} listLength - Number of rows in that list.
 * @returns {number} The saved index (floored) when the record exists, its
 *     signature matches and the index is inside the list; otherwise 0.
 */
function restoreProgress(areaSignature, listLength) {
    const saved = readProgress();
    if (!saved) {
        return 0;
    }
    const signatureMatches = Boolean(saved.areaSignature) && saved.areaSignature === areaSignature;
    if (!signatureMatches) {
        return 0;
    }
    const index = Number.isFinite(saved.nextCityIndex) ? Math.floor(saved.nextCityIndex) : 0;
    const outOfRange = index < 0 || index >= listLength;
    return outOfRange ? 0 : index;
}
/**
 * Forget the saved resume point, both locally and on the server.
 *
 * The local removal is best effort (storage errors are ignored); the remote
 * clear request is always queued afterwards.
 *
 * @returns {void}
 */
function clearProgress() {
    const clearRequest = {
        action: 'clear',
        progress_key: PROGRESS_KEY,
        device_id: DEVICE_ID,
    };
    try {
        localStorage.removeItem(PROGRESS_STORAGE_KEY);
    } catch (_) {
        // Storage may be unavailable; the remote clear below still runs.
    }
    enqueueRemoteProgressSync(clearRequest);
}
function gmGetJson(url) { function gmGetJson(url) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
GM_xmlhttpRequest({ GM_xmlhttpRequest({
@@ -90,6 +203,76 @@
}); });
} }
/**
 * POST a JSON body through GM_xmlhttpRequest and parse the JSON reply.
 *
 * The promise always settles: besides network errors it also rejects on
 * timeout and on abort. Without that, a stalled request would leave the
 * caller awaiting forever.
 *
 * @param {string} url - Target URL.
 * @param {object} [data] - Body to serialise; a falsy value is sent as `{}`.
 * @param {number} [timeoutMs=15000] - Request timeout in milliseconds.
 * @returns {Promise<object>} Parsed response JSON (an empty body parses as `{}`).
 *     Rejects when the response is not valid JSON, or on error, timeout or abort.
 */
function gmPostJson(url, data, timeoutMs = 15000) {
    return new Promise((resolve, reject) => {
        GM_xmlhttpRequest({
            method: 'POST',
            url,
            headers: { 'Content-Type': 'application/json' },
            data: JSON.stringify(data || {}),
            timeout: timeoutMs,
            onload(res) {
                try {
                    const json = JSON.parse(res.responseText || '{}');
                    resolve(json);
                } catch (e) {
                    reject(e);
                }
            },
            onerror(error) { reject(error); },
            ontimeout() { reject(new Error(`POST ${url} timed out after ${timeoutMs} ms`)); },
            onabort() { reject(new Error(`POST ${url} was aborted`)); },
        });
    });
}
/**
 * Queue a progress payload for the remote API.
 *
 * Only the latest payload is kept: a newer one overwrites whatever was still
 * waiting. A flush is started unless one is already in flight.
 *
 * @param {object} payload - Request body for the progress endpoint; ignored
 *     when it is not an object or when remote sync is disabled.
 * @returns {void}
 */
function enqueueRemoteProgressSync(payload) {
    const isUsablePayload = Boolean(payload) && typeof payload === 'object';
    if (!PROGRESS_SYNC_ENABLED || !isUsablePayload) {
        return;
    }
    progressSyncPendingPayload = payload;
    if (!progressSyncInFlight) {
        flushRemoteProgressSync();
    }
}
/**
 * Send queued progress payloads to the remote API, one at a time.
 *
 * Keeps posting while a pending payload exists, so a payload queued during an
 * in-flight request is picked up by the same run. A failed request is logged
 * and ends the run; the payload that failed is not retried.
 *
 * @returns {Promise<void>} Resolves when the queue is drained or a post fails.
 */
async function flushRemoteProgressSync() {
    if (!PROGRESS_SYNC_ENABLED || progressSyncInFlight) {
        return;
    }
    progressSyncInFlight = true;
    try {
        for (;;) {
            const next = progressSyncPendingPayload;
            if (!next) {
                break;
            }
            // Take the payload off the queue before awaiting so a newer one can replace it.
            progressSyncPendingPayload = null;
            try {
                await gmPostJson(PROGRESS_API, next);
            } catch (e) {
                err('同步远端进度失败', e);
                break;
            }
        }
    } finally {
        progressSyncInFlight = false;
    }
}
/**
 * Fetch the shared resume index from the remote progress API.
 *
 * @param {string} areaSignature - Signature of the area list now in use.
 * @param {number} listLength - Number of rows in that list.
 * @returns {Promise<number>} The remote `next_city_index` (floored) when the
 *     record exists, its signature matches and the index is inside the list;
 *     0 in every other case, including request failure or disabled sync.
 */
async function restoreRemoteProgress(areaSignature, listLength) {
    if (!PROGRESS_SYNC_ENABLED) {
        return 0;
    }
    try {
        const url = `${PROGRESS_API}&progress_key=${encodeURIComponent(PROGRESS_KEY)}`;
        const response = await gmGetJson(url);
        const record = response && response.data;
        if (!record || typeof record !== 'object') {
            return 0;
        }

        const remoteSignature = String(record.area_signature || '');
        const signatureMatches = Boolean(remoteSignature) && remoteSignature === areaSignature;
        if (!signatureMatches) {
            return 0;
        }

        // The index may arrive as a number or as a numeric string.
        const rawIndex = record.next_city_index;
        const numericIndex = Number.isFinite(rawIndex) ? rawIndex : Number(rawIndex || 0);
        const index = Math.floor(numericIndex);
        const unusable = !Number.isFinite(index) || index < 0 || index >= listLength;
        return unusable ? 0 : index;
    } catch (e) {
        err('读取远端进度失败', e);
        return 0;
    }
}
function setNativeValue(el, value) { function setNativeValue(el, value) {
if (!el) return; if (!el) return;
const prototype = el.constructor && el.constructor.prototype ? el.constructor.prototype : window.HTMLInputElement && window.HTMLInputElement.prototype; const prototype = el.constructor && el.constructor.prototype ? el.constructor.prototype : window.HTMLInputElement && window.HTMLInputElement.prototype;
@@ -263,6 +446,11 @@
let scrolls = 0; let scrolls = 0;
while (!stopFlag) { while (!stopFlag) {
if (skipCurrentCityFlag) {
statusNode.textContent = '收到跳过指令,结束当前地区滚动。';
break;
}
scrolls++; scrolls++;
if (scrolls > maxScrolls) { if (scrolls > maxScrolls) {
statusNode.textContent = `达到单次搜索最大滚动 ${maxScrolls},停止本次自动下滑。`; statusNode.textContent = `达到单次搜索最大滚动 ${maxScrolls},停止本次自动下滑。`;
@@ -276,7 +464,12 @@
window.scrollTo(0, (document.body.scrollHeight || document.documentElement.scrollHeight)); window.scrollTo(0, (document.body.scrollHeight || document.documentElement.scrollHeight));
} }
await new Promise(r => setTimeout(r, SCROLL_INTERVAL_MS)); await sleep(SCROLL_INTERVAL_MS);
if (skipCurrentCityFlag) {
statusNode.textContent = '收到跳过指令,结束当前地区滚动。';
break;
}
const curHeight = document.body.scrollHeight || document.documentElement.scrollHeight || 0; const curHeight = document.body.scrollHeight || document.documentElement.scrollHeight || 0;
if (curHeight === lastHeight) { if (curHeight === lastHeight) {
@@ -391,7 +584,9 @@
const css = ` const css = `
#dm-batch-btn { position: fixed; right: 12px; bottom: 12px; z-index:999999; background: rgba(0,0,0,0.65); color:#fff; #dm-batch-btn { position: fixed; right: 12px; bottom: 12px; z-index:999999; background: rgba(0,0,0,0.65); color:#fff;
padding:8px 10px; border-radius:8px; font-size:13px; cursor:pointer; user-select:none;} padding:8px 10px; border-radius:8px; font-size:13px; cursor:pointer; user-select:none;}
#dm-batch-status { position: fixed; right:12px; bottom:56px; z-index:999999; background: rgba(0,0,0,0.45); color:#fff; #dm-batch-skip { position: fixed; right:12px; bottom:50px; z-index:999999; background: rgba(30,30,30,0.72); color:#fff;
padding:7px 10px; border-radius:8px; font-size:12px; cursor:pointer; user-select:none;}
#dm-batch-status { position: fixed; right:12px; bottom:88px; z-index:999999; background: rgba(0,0,0,0.45); color:#fff;
padding:6px 8px; border-radius:6px; font-size:12px; max-width:320px; word-break:break-word;} padding:6px 8px; border-radius:6px; font-size:12px; max-width:320px; word-break:break-word;}
`; `;
const s = document.createElement('style'); s.textContent = css; document.head && document.head.appendChild(s); const s = document.createElement('style'); s.textContent = css; document.head && document.head.appendChild(s);
@@ -402,6 +597,11 @@
btn.dataset.running = '1'; btn.dataset.running = '1';
document.body.appendChild(btn); document.body.appendChild(btn);
const skipBtn = document.createElement('div');
skipBtn.id = 'dm-batch-skip';
skipBtn.textContent = 'BatchSearch:跳过当前';
document.body.appendChild(skipBtn);
const status = document.createElement('div'); const status = document.createElement('div');
status.id = 'dm-batch-status'; status.id = 'dm-batch-status';
status.textContent = '准备中...'; status.textContent = '准备中...';
@@ -411,21 +611,55 @@
const running = btn.dataset.running === '1'; const running = btn.dataset.running === '1';
btn.dataset.running = running ? '0' : '1'; btn.dataset.running = running ? '0' : '1';
btn.textContent = running ? 'BatchSearch:已停止' : 'BatchSearch:停止'; btn.textContent = running ? 'BatchSearch:已停止' : 'BatchSearch:停止';
status.textContent = running ? '已手动停止' : '已启动'; status.textContent = running ? '已手动停止(已保存断点)' : '已启动';
stopFlag = running; // if was running and clicked -> set stopFlag true; if restarting, set false stopFlag = running; // if was running and clicked -> set stopFlag true; if restarting, set false
if (running) {
skipCurrentCityFlag = false;
persistProgress(Math.max(currentCityIndex, 0), 'manual_pause');
}
if (!stopFlag) { if (!stopFlag) {
// restart loop if needed // restart loop if needed
runBatchSearchLoop(status).catch(e => err(e)); runBatchSearchLoop(status).catch(e => err(e));
} }
}); });
return { btn, status }; skipBtn.addEventListener('click', () => {
if (currentCityIndex < 0) {
status.textContent = '当前还未开始处理城市,稍后再跳过。';
return;
}
skipCurrentCityFlag = true;
const areaName = getAreaRowName(areaList[currentCityIndex] || {});
status.textContent = `收到跳过指令:${areaName || `索引${currentCityIndex}`}`;
});
skipBtn.addEventListener('contextmenu', (event) => {
event.preventDefault();
clearProgress();
currentCityIndex = 0;
status.textContent = '已清除断点。下次将从第 1 个地区开始。';
});
return { btn, skipBtn, status };
} }
/********************* 主流程:获取城市并循环搜索 *********************/ /********************* 主流程:获取城市并循环搜索 *********************/
async function runBatchSearchLoop(statusNode) { async function runBatchSearchLoop(statusNode) {
if (isLoopRunning) {
statusNode.textContent = '批量任务已在运行中,请勿重复启动。';
return;
}
isLoopRunning = true;
try { try {
stopFlag = (document.getElementById('dm-batch-btn') && document.getElementById('dm-batch-btn').dataset.running === '0'); stopFlag = (document.getElementById('dm-batch-btn') && document.getElementById('dm-batch-btn').dataset.running === '0');
skipCurrentCityFlag = false;
if (stopFlag) {
statusNode.textContent = '当前是暂停状态,点击“BatchSearch:停止”可继续。';
return;
}
// 获取 area list(仅在内存为空时获取) // 获取 area list(仅在内存为空时获取)
if (!areaList || !Array.isArray(areaList) || areaList.length === 0) { if (!areaList || !Array.isArray(areaList) || areaList.length === 0) {
statusNode.textContent = '正在获取城市列表...'; statusNode.textContent = '正在获取城市列表...';
@@ -451,6 +685,20 @@
} }
} }
currentAreaSignature = buildAreaSignature(areaList);
const restoredIndexLocal = restoreProgress(currentAreaSignature, areaList.length);
const restoredIndexRemote = await restoreRemoteProgress(currentAreaSignature, areaList.length);
const restoredIndex = Math.max(restoredIndexLocal, restoredIndexRemote);
const startIndex = (currentCityIndex >= 0 && currentCityIndex < areaList.length)
? currentCityIndex
: restoredIndex;
currentCityIndex = startIndex;
if (startIndex > 0) {
statusNode.textContent = `检测到断点(本地:${restoredIndexLocal + 1} 远端:${restoredIndexRemote + 1}),将从第 ${startIndex + 1}/${areaList.length} 个地区继续。`;
await sleep(500);
}
// 等待搜索输入与按钮可用 // 等待搜索输入与按钮可用
try { try {
await ensureSearchControls(statusNode); await ensureSearchControls(statusNode);
@@ -460,12 +708,27 @@
return; return;
} }
let completedAll = true;
// 主循环:对每个 city 执行搜索 -> 下滑 -> 发送结果 -> 下一 city // 主循环:对每个 city 执行搜索 -> 下滑 -> 发送结果 -> 下一 city
for (let i = 0; i < areaList.length; i++) { for (let i = startIndex; i < areaList.length; i++) {
if (stopFlag) { statusNode.textContent = '已停止'; break; } if (stopFlag) {
completedAll = false;
persistProgress(i, 'manual_stop');
statusNode.textContent = '已停止(断点已保存)。';
break;
}
currentCityIndex = i; currentCityIndex = i;
skipCurrentCityFlag = false;
persistProgress(i, 'start_city');
const city = (areaList[i].city || areaList[i].province || '').trim(); const city = (areaList[i].city || areaList[i].province || '').trim();
if (!city) continue; if (!city) {
persistProgress(i + 1, 'empty_city');
continue;
}
const keyword = `${city}律师`; const keyword = `${city}律师`;
statusNode.textContent = `正在搜索:${keyword} ${i+1}/${areaList.length}`; statusNode.textContent = `正在搜索:${keyword} ${i+1}/${areaList.length}`;
log(`开始城市[${i+1}/${areaList.length}] 搜索:`, keyword); log(`开始城市[${i+1}/${areaList.length}] 搜索:`, keyword);
@@ -476,6 +739,8 @@
} catch (e) { } catch (e) {
err('刷新搜索控件失败', e); err('刷新搜索控件失败', e);
statusNode.textContent = '刷新搜索控件失败,终止批量搜索。'; statusNode.textContent = '刷新搜索控件失败,终止批量搜索。';
completedAll = false;
persistProgress(i, 'search_control_error');
break; break;
} }
@@ -488,6 +753,8 @@
await ensureSearchControls(statusNode); await ensureSearchControls(statusNode);
if (!simulateSearchTrigger()) { if (!simulateSearchTrigger()) {
statusNode.textContent = '搜索触发失败,终止批量搜索。'; statusNode.textContent = '搜索触发失败,终止批量搜索。';
completedAll = false;
persistProgress(i, 'search_trigger_error');
break; break;
} }
} }
@@ -498,24 +765,54 @@
// 自动下滑直到稳定或达到上限 // 自动下滑直到稳定或达到上限
await autoScrollUntilStable(statusNode, MAX_SCROLLS_PER_CITY); await autoScrollUntilStable(statusNode, MAX_SCROLLS_PER_CITY);
if (stopFlag) { statusNode.textContent = '已停止'; break; } if (skipCurrentCityFlag) {
skipCurrentCityFlag = false;
persistProgress(i + 1, 'skip_city');
statusNode.textContent = `已跳过 ${keyword},继续下一个地区...`;
await sleep(Math.min(DELAY_BETWEEN_CITIES_MS, 800));
continue;
}
if (stopFlag) {
completedAll = false;
persistProgress(i, 'manual_stop_after_scroll');
statusNode.textContent = '已停止(断点已保存)。';
break;
}
persistProgress(i + 1, 'city_done');
// 等待短暂间隔再进行下一个城市 // 等待短暂间隔再进行下一个城市
statusNode.textContent = `完成 ${keyword} 的加载,等待 ${DELAY_BETWEEN_CITIES_MS} ms 后继续...`; statusNode.textContent = `完成 ${keyword} 的加载,等待 ${DELAY_BETWEEN_CITIES_MS} ms 后继续...`;
await new Promise(r => setTimeout(r, DELAY_BETWEEN_CITIES_MS)); await sleep(DELAY_BETWEEN_CITIES_MS);
} }
statusNode.textContent = '批量搜索完成或已停止。'; if (completedAll && !stopFlag) {
log('批量搜索循环结束'); clearProgress();
currentCityIndex = -1;
statusNode.textContent = '批量搜索完成,已清除断点进度。';
log('批量搜索循环结束: completed');
} else {
log('批量搜索循环结束: paused/broken');
}
} catch (e) { } catch (e) {
err('runBatchSearchLoop error', e); err('runBatchSearchLoop error', e);
persistProgress(Math.max(currentCityIndex, 0), 'loop_exception');
} finally {
isLoopRunning = false;
} }
} }
/********************* 启动脚本 *********************/ /********************* 启动脚本 *********************/
(function init() { (function init() {
window.addEventListener('beforeunload', () => {
if (currentCityIndex >= 0) {
persistProgress(Math.max(currentCityIndex, 0), 'page_unload');
}
});
const ui = createUI(); const ui = createUI();
ui.status.textContent = '就绪 - 点击右下按钮可停止/重启批量搜索'; ui.status.textContent = '就绪 - 可暂停/跳过,自动保存断点(右键跳过按钮可清除断点)';
console.log(location.pathname) console.log(location.pathname)
// 如果当前为目标页面(/jingxuan/search/),则自动启动;否则仍可在任何页面打开并手动启动。 // 如果当前为目标页面(/jingxuan/search/),则自动启动;否则仍可在任何页面打开并手动启动。
const isAutoPage = location.pathname && location.pathname.indexOf('/search/') !== -1; const isAutoPage = location.pathname && location.pathname.indexOf('/search/') !== -1;
+264 -4
View File
@@ -5,7 +5,7 @@ import sys
import threading import threading
import time import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Dict, Iterable, List, Set, Tuple from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
current_dir = os.path.dirname(os.path.abspath(__file__)) current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -21,6 +21,9 @@ AREA_DOMAIN = os.getenv("AREA_DOMAIN", "maxlaw")
DOUYIN_DOMAIN = os.getenv("DOUYIN_DOMAIN", "抖音") DOUYIN_DOMAIN = os.getenv("DOUYIN_DOMAIN", "抖音")
DOUYIN_RAW_DIR = os.getenv("DOUYIN_RAW_DIR", os.path.join(project_root, "data", "douyin_raw")) DOUYIN_RAW_DIR = os.getenv("DOUYIN_RAW_DIR", os.path.join(project_root, "data", "douyin_raw"))
DOUYIN_SAVE_ONLY_ENV = os.getenv("DOUYIN_SAVE_ONLY", "1") DOUYIN_SAVE_ONLY_ENV = os.getenv("DOUYIN_SAVE_ONLY", "1")
LAWYER_KEYWORDS_ENV = os.getenv("DOUYIN_LAWYER_KEYWORDS", "律师,律所")
PROGRESS_TABLE = os.getenv("LAYER_PROGRESS_TABLE", "layer_progress")
PROGRESS_DEFAULT_KEY = os.getenv("LAYER_PROGRESS_DEFAULT_KEY", "douyin_batch_default")
SERVICE_HOST = os.getenv("AREA_SERVICE_HOST", "0.0.0.0") SERVICE_HOST = os.getenv("AREA_SERVICE_HOST", "0.0.0.0")
SERVICE_PORT = int(os.getenv("AREA_SERVICE_PORT", "9002")) SERVICE_PORT = int(os.getenv("AREA_SERVICE_PORT", "9002"))
@@ -29,6 +32,10 @@ WX_CONTEXT_REGEX = re.compile(r"(?i)(?:微信|微.?信|wx|vx|weixin|v信|v号|v)
LAW_FIRM_REGEX = re.compile(r"([\u4e00-\u9fa5A-Za-z·]{2,40}律师事务所)") LAW_FIRM_REGEX = re.compile(r"([\u4e00-\u9fa5A-Za-z·]{2,40}律师事务所)")
RAW_WRITE_LOCK = threading.Lock() RAW_WRITE_LOCK = threading.Lock()
# Keywords checked against profile text before a user is stored. Parsed from the
# comma-separated DOUYIN_LAWYER_KEYWORDS value; entries are stripped and blank
# entries are dropped.
LAWYER_KEYWORDS: Tuple[str, ...] = tuple(
keyword.strip() for keyword in LAWYER_KEYWORDS_ENV.split(",") if keyword.strip()
)
def _is_safe_table_name(table_name: str) -> bool: def _is_safe_table_name(table_name: str) -> bool:
return bool(re.fullmatch(r"[A-Za-z0-9_]+", table_name or "")) return bool(re.fullmatch(r"[A-Za-z0-9_]+", table_name or ""))
@@ -83,6 +90,161 @@ def _save_raw_index_payload(payload: Dict[str, Any], query: Dict[str, List[str]]
return file_path return file_path
# Set once the CREATE TABLE statement has succeeded in this process.
_PROGRESS_TABLE_READY = False
# Serialises the first creation attempt across request threads.
_PROGRESS_TABLE_LOCK = threading.Lock()


def _ensure_progress_table() -> None:
    """Create the progress table if it does not exist yet.

    The progress helpers in this file call this before every query, so the DDL
    is executed only until it has succeeded once in this process; later calls
    return immediately. A failed attempt leaves the flag unset and is retried
    on the next call.

    Raises:
        ValueError: if ``PROGRESS_TABLE`` is not a safe table name.
        Exception: whatever the database layer raises while creating the table.
    """
    global _PROGRESS_TABLE_READY
    if _PROGRESS_TABLE_READY:
        return
    if not _is_safe_table_name(PROGRESS_TABLE):
        raise ValueError("非法进度表名")

    with _PROGRESS_TABLE_LOCK:
        # Another thread may have finished the creation while this one waited.
        if _PROGRESS_TABLE_READY:
            return
        sql = f"""
            CREATE TABLE IF NOT EXISTS `{PROGRESS_TABLE}` (
                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
                `progress_key` varchar(128) NOT NULL,
                `next_city_index` int(11) DEFAULT 0,
                `area_signature` varchar(128) DEFAULT NULL,
                `area_total` int(11) DEFAULT 0,
                `current_city` varchar(128) DEFAULT NULL,
                `reason` varchar(64) DEFAULT NULL,
                `status` varchar(32) DEFAULT NULL,
                `device_id` varchar(128) DEFAULT NULL,
                `extra_json` longtext,
                `updated_at` bigint(20) DEFAULT NULL,
                `create_time` bigint(20) DEFAULT NULL,
                PRIMARY KEY (`id`),
                UNIQUE KEY `uk_progress_key` (`progress_key`),
                KEY `idx_updated_at` (`updated_at`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """
        with Db() as db:
            cursor = db.db.cursor()
            try:
                cursor.execute(sql)
                db.db.commit()
            finally:
                # Release the cursor even when the DDL or the commit fails.
                cursor.close()
        _PROGRESS_TABLE_READY = True
def _get_progress(progress_key: str) -> Optional[Dict[str, Any]]:
    """Load the progress row stored under ``progress_key``.

    Args:
        progress_key: Key identifying the shared progress record; surrounding
            whitespace is ignored.

    Returns:
        A dict describing the row, or ``None`` when the key is blank or no row
        exists. ``extra`` holds the decoded ``extra_json`` column, the raw text
        when it is not valid JSON, or ``{}`` when the column is empty.

    Raises:
        Exception: whatever the database layer raises; the cursor is closed
            before the error propagates.
    """
    key = str(progress_key or "").strip()
    if not key:
        return None

    _ensure_progress_table()
    sql = (
        f"SELECT progress_key, next_city_index, area_signature, area_total, current_city, "
        f"reason, status, device_id, extra_json, updated_at, create_time "
        f"FROM `{PROGRESS_TABLE}` WHERE progress_key=%s LIMIT 1"
    )
    with Db() as db:
        cursor = db.db.cursor()
        try:
            cursor.execute(sql, (key,))
            row = cursor.fetchone()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()

    if not row:
        return None

    extra_json = row[8] or ""
    extra_obj: Any = {}
    if extra_json:
        try:
            extra_obj = json.loads(extra_json)
        except Exception:
            # Stored text is not valid JSON: hand it back unchanged.
            extra_obj = extra_json

    return {
        "progress_key": row[0] or "",
        "next_city_index": _parse_int(row[1], 0),
        "area_signature": row[2] or "",
        "area_total": _parse_int(row[3], 0),
        "current_city": row[4] or "",
        "reason": row[5] or "",
        "status": row[6] or "",
        "device_id": row[7] or "",
        "extra": extra_obj,
        "updated_at": _parse_int(row[9], 0),
        "create_time": _parse_int(row[10], 0),
    }
def _upsert_progress(progress_key: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """Insert or update the progress row stored under ``progress_key``.

    Args:
        progress_key: Key identifying the shared progress record.
        payload: Request body. Reads ``next_city_index``, ``area_signature``,
            ``area_total``, ``current_city``, ``reason``, ``status``,
            ``device_id`` and ``extra`` (falling back to ``extra_json``).
            A string ``extra`` is stored as-is; anything else is JSON-encoded.

    Returns:
        The row as re-read through ``_get_progress``, or ``{}`` if it cannot
        be found afterwards.

    Raises:
        ValueError: if ``progress_key`` is blank.
        Exception: whatever the database layer raises; the cursor is closed
            before the error propagates.
    """
    key = str(progress_key or "").strip()
    if not key:
        raise ValueError("progress_key 不能为空")

    _ensure_progress_table()

    now_ts = int(time.time())
    next_city_index = _parse_int(payload.get("next_city_index"), 0)
    area_signature = str(payload.get("area_signature") or "").strip()
    area_total = _parse_int(payload.get("area_total"), 0)
    current_city = str(payload.get("current_city") or "").strip()
    reason = str(payload.get("reason") or "").strip()
    status = str(payload.get("status") or "").strip()
    device_id = str(payload.get("device_id") or "").strip()

    extra = payload.get("extra")
    if extra is None:
        extra = payload.get("extra_json")
    if isinstance(extra, str):
        extra_json = extra
    else:
        try:
            extra_json = json.dumps(extra or {}, ensure_ascii=False)
        except (TypeError, ValueError):
            extra_json = "{}"

    # create_time is only written on insert; the update clause leaves it alone.
    sql = (
        f"INSERT INTO `{PROGRESS_TABLE}` "
        "(progress_key, next_city_index, area_signature, area_total, current_city, reason, status, "
        "device_id, extra_json, updated_at, create_time) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
        "ON DUPLICATE KEY UPDATE "
        "next_city_index=VALUES(next_city_index), "
        "area_signature=VALUES(area_signature), "
        "area_total=VALUES(area_total), "
        "current_city=VALUES(current_city), "
        "reason=VALUES(reason), "
        "status=VALUES(status), "
        "device_id=VALUES(device_id), "
        "extra_json=VALUES(extra_json), "
        "updated_at=VALUES(updated_at)"
    )
    params = (
        key,
        next_city_index,
        area_signature,
        area_total,
        current_city,
        reason,
        status,
        device_id,
        extra_json,
        now_ts,
        now_ts,
    )
    with Db() as db:
        cursor = db.db.cursor()
        try:
            cursor.execute(sql, params)
            db.db.commit()
        finally:
            # Release the cursor even when the write or the commit fails.
            cursor.close()

    return _get_progress(key) or {}
def _clear_progress(progress_key: str) -> int:
    """Delete the progress row stored under ``progress_key``.

    Args:
        progress_key: Key identifying the shared progress record; surrounding
            whitespace is ignored.

    Returns:
        The cursor's ``rowcount`` after the delete, or 0 when the key is blank.

    Raises:
        Exception: whatever the database layer raises; the cursor is closed
            before the error propagates.
    """
    key = str(progress_key or "").strip()
    if not key:
        return 0

    _ensure_progress_table()
    sql = f"DELETE FROM `{PROGRESS_TABLE}` WHERE progress_key=%s"
    with Db() as db:
        cursor = db.db.cursor()
        try:
            cursor.execute(sql, (key,))
            affected = cursor.rowcount
            db.db.commit()
        finally:
            # Release the cursor even when the delete or the commit fails.
            cursor.close()
    return affected
def _query_area_data(table_name: str, domain: str) -> List[Dict[str, Any]]: def _query_area_data(table_name: str, domain: str) -> List[Dict[str, Any]]:
if not _is_safe_table_name(table_name): if not _is_safe_table_name(table_name):
raise ValueError("非法表名") raise ValueError("非法表名")
@@ -193,6 +355,36 @@ def _extract_law_firm_from_user_info(user_info: Dict[str, Any]) -> str:
return "" return ""
def _extract_account_cert_text(user_info: Dict[str, Any]) -> str:
    """Return the certification label text found in ``account_cert_info``.

    Handles the shapes the field can take:

    * a dict: its ``label_text`` is used directly;
    * a JSON string encoding a dict: the decoded ``label_text`` is used;
    * a JSON string encoding a plain string: the decoded string is used;
    * a string that is not valid JSON: the stripped text itself is used.

    Args:
        user_info: The ``user_info`` dict of one user.

    Returns:
        The stripped label text, or ``""`` when the field is missing, blank,
        or decodes to something other than a dict or a string.
    """
    account_cert_info = user_info.get("account_cert_info")
    if isinstance(account_cert_info, dict):
        return str(account_cert_info.get("label_text") or "").strip()
    if not isinstance(account_cert_info, str):
        return ""

    raw_text = account_cert_info.strip()
    if not raw_text:
        return ""

    try:
        cert_obj = json.loads(raw_text)
    except (ValueError, RecursionError):
        # Not JSON: treat the field as the label itself.
        return raw_text

    if isinstance(cert_obj, dict):
        return str(cert_obj.get("label_text") or "").strip()
    if isinstance(cert_obj, str):
        return cert_obj.strip()
    return ""
def _is_lawyer_related_user(user_info: Dict[str, Any], name: str, law_firm: str) -> bool:
    """Tell whether any configured keyword appears in the user's profile text.

    The text searched is ``name``, a fixed set of ``user_info`` fields, the
    account certification text and ``law_firm``, joined with newlines.

    Args:
        user_info: The ``user_info`` dict of one user.
        name: Display name already extracted by the caller.
        law_firm: Law firm name already extracted by the caller.

    Returns:
        ``True`` when at least one entry of ``LAWYER_KEYWORDS`` is a substring
        of the combined text; ``False`` when the text is empty or nothing matches.
    """
    profile_fields = (
        "nickname",
        "signature",
        "custom_verify",
        "enterprise_verify_reason",
        "versatile_display",
        "unique_id",
    )
    candidates = [name]
    candidates.extend(str(user_info.get(field) or "") for field in profile_fields)
    candidates.append(_extract_account_cert_text(user_info))
    candidates.append(law_firm)

    haystack = "\n".join(filter(None, candidates)).strip()
    if not haystack:
        return False

    for keyword in LAWYER_KEYWORDS:
        if keyword in haystack:
            return True
    return False
def _pick_first_str(node: Dict[str, Any], keys: Tuple[str, ...]) -> str: def _pick_first_str(node: Dict[str, Any], keys: Tuple[str, ...]) -> str:
for key in keys: for key in keys:
value = node.get(key) value = node.get(key)
@@ -399,14 +591,21 @@ def _extract_lawyer_rows_from_payload(
if not isinstance(user_info, dict): if not isinstance(user_info, dict):
continue continue
name = str(user_info.get("nickname") or "").strip()
law_firm = _extract_law_firm_from_user_info(user_info)
# 强约束:必须出现“律师/律所”等关键词,避免非法律相关账号入库
if not _is_lawyer_related_user(user_info, name, law_firm):
continue
phones = _extract_phones_from_user_info(user_info) phones = _extract_phones_from_user_info(user_info)
if not phones: if not phones:
continue continue
name = str(user_info.get("nickname") or "").strip()
law_firm = _extract_law_firm_from_user_info(user_info)
sec_uid = str(user_info.get("sec_uid") or "").strip() sec_uid = str(user_info.get("sec_uid") or "").strip()
url = f"https://www.douyin.com/user/{sec_uid}" if sec_uid else api_url if not sec_uid:
continue
url = f"https://www.douyin.com/user/{sec_uid}"
province = city_province province = city_province
city = city_name or city_province city = city_name or city_province
@@ -516,12 +715,67 @@ class AreaSyncHandler(BaseHTTPRequestHandler):
self._write_json(200, rows) self._write_json(200, rows)
return return
if parsed.path == "/api/layer/progress":
progress_key = _first_param(params, "progress_key", PROGRESS_DEFAULT_KEY).strip() or PROGRESS_DEFAULT_KEY
try:
row = _get_progress(progress_key)
except Exception as exc:
self._write_json(500, {"ok": False, "error": str(exc)})
return
self._write_json(
200,
{
"ok": True,
"progress_key": progress_key,
"data": row,
},
)
return
self._write_json(404, {"ok": False, "error": "not found"}) self._write_json(404, {"ok": False, "error": "not found"})
def do_POST(self) -> None: def do_POST(self) -> None:
parsed = urlparse(self.path) parsed = urlparse(self.path)
params = parse_qs(parsed.query) params = parse_qs(parsed.query)
if parsed.path == "/api/layer/progress":
body = self._read_json_body()
if not isinstance(body, dict):
body = {}
progress_key = str(body.get("progress_key") or _first_param(params, "progress_key", PROGRESS_DEFAULT_KEY)).strip() or PROGRESS_DEFAULT_KEY
action = str(body.get("action") or _first_param(params, "action", "upsert")).strip().lower() or "upsert"
try:
if action == "clear":
deleted = _clear_progress(progress_key)
self._write_json(
200,
{
"ok": True,
"action": "clear",
"progress_key": progress_key,
"deleted": deleted,
},
)
return
saved = _upsert_progress(progress_key, body)
self._write_json(
200,
{
"ok": True,
"action": "upsert",
"progress_key": progress_key,
"data": saved,
},
)
return
except Exception as exc:
self._write_json(500, {"ok": False, "error": str(exc)})
return
if parsed.path == "/api/layer/index": if parsed.path == "/api/layer/index":
body = self._read_json_body() body = self._read_json_body()
if not isinstance(body, dict) or not body: if not isinstance(body, dict) or not body:
@@ -578,10 +832,16 @@ class AreaSyncHandler(BaseHTTPRequestHandler):
def run() -> None: def run() -> None:
try:
_ensure_progress_table()
except Exception as exc:
print(f"[layer-service] init progress table failed: {exc}")
server = ThreadingHTTPServer((SERVICE_HOST, SERVICE_PORT), AreaSyncHandler) server = ThreadingHTTPServer((SERVICE_HOST, SERVICE_PORT), AreaSyncHandler)
print(f"[layer-service] running on http://{SERVICE_HOST}:{SERVICE_PORT}") print(f"[layer-service] running on http://{SERVICE_HOST}:{SERVICE_PORT}")
print(f"[layer-service] get_area -> table/domain: {AREA_TABLE}/{AREA_DOMAIN}") print(f"[layer-service] get_area -> table/domain: {AREA_TABLE}/{AREA_DOMAIN}")
print(f"[layer-service] index -> save domain: {DOUYIN_DOMAIN}") print(f"[layer-service] index -> save domain: {DOUYIN_DOMAIN}")
print(f"[layer-service] progress table/default key: {PROGRESS_TABLE}/{PROGRESS_DEFAULT_KEY}")
server.serve_forever() server.serve_forever()