# Mirror of https://github.com/zc-zhangchen/any-auto-register.git
# Synced 2026-05-07 07:45:49 +08:00
# 1123 lines, 37 KiB, Python
from __future__ import annotations
|
||
|
||
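"""SMSToMe phone pool + OTP helper.

This file is a **standalone utility script** responsible for:

1. `update_global_phone_list`: scrape every available phone number for
   several countries and write them to a local txt file.
2. `get_unused_phone`: return a phone number that has not yet been used for
   a given task name.
3. `wait_for_otp`: poll that number's SMS page and extract the verification
   code.

Implementation details:

- HTTP + HTML parsing built on `httpx` + `selectolax`;
- uses a browser-style UA by default and disables the system proxy
  (`trust_env=False`) so Tavily-related proxy behaviour is not affected;
- the cookie can be injected via the `SMSTOME_COOKIE` environment variable,
  the repository-root `config.yaml`, or an explicit argument;
- uses a simple loop + backoff retry to avoid adding a tenacity dependency.

Notes:

- txt persistence is only simple record keeping, not database-grade state
  management;
- the global number file additionally stores the country slug and the
  detail-page URL, which makes fetching verification codes easier later;
- each task's "used numbers" list is its own txt file with one phone number
  per line.
"""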
|
||
|
||
import os
|
||
import random
|
||
import re
|
||
import time
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
from typing import Callable, Dict, Iterable, List, Optional
|
||
|
||
import httpx
|
||
from selectolax.parser import HTMLParser
|
||
from urllib.parse import urljoin, urlsplit
|
||
|
||
try:
    from runtime_support import get_nonempty_str, load_yaml_config
except ImportError:
    # `runtime_support` is optional; provide minimal local stand-ins so this
    # module keeps working as a standalone script.

    def get_nonempty_str(mapping, *keys):
        """Return the first non-blank value found under ``keys``.

        Values are stringified and stripped; ``mapping`` is treated as empty
        when it is not a dict. Returns ``""`` when nothing usable is found.
        """
        source = mapping if isinstance(mapping, dict) else {}
        cleaned = (str(source.get(name, "") or "").strip() for name in keys)
        return next((text for text in cleaned if text), "")

    def load_yaml_config(config_path):
        """Load a YAML mapping from ``config_path``.

        Returns ``{}`` when the file is missing, PyYAML is not installed,
        reading/parsing fails, or the document is not a mapping.
        """
        target = Path(config_path)
        if not target.exists():
            return {}
        try:
            import yaml
        except ImportError:
            return {}
        try:
            parsed = yaml.safe_load(target.read_text(encoding="utf-8"))
        except Exception:
            # Best-effort loader: any read/parse failure means "no config".
            return {}
        if isinstance(parsed, dict):
            return parsed
        return {}
|
||
|
||
|
||
# Base URL of the SMSToMe site; country listing URLs are built on top of it.
SMSTOME_BASE_URL = "https://smstome.com"
# Default YAML config file: a `config.yaml` sitting next to this module.
DEFAULT_CONFIG_PATH = Path(__file__).with_name("config.yaml")

# Currently supported country slugs (taken from the site's URLs).
DEFAULT_COUNTRY_SLUGS: List[str] = [
    "poland",
    "united-kingdom",
    "slovenia",
    "sweden",
    "finland",
    "belgium",
]

# Global phone list file (one tab-separated `phone  country_slug  detail_url`
# record per line). NOTE(review): this is a relative path, so it resolves
# against the process working directory.
GLOBAL_PHONE_FILE = Path("smstome_all_numbers.txt")
# Default cap on how many listing pages are scraped per country during sync.
DEFAULT_SYNC_MAX_PAGES_PER_COUNTRY = 5

# Directory holding each task's own "used numbers" file
# (file name: <task>_used_numbers.txt).
USED_NUMBERS_DIR = Path("smstome_used")
BLACKLISTED_NUMBERS_SUFFIX = "_blacklisted_numbers.txt"
USED_NUMBERS_SUFFIX = "_used_numbers.txt"
# Number of leading characters of a phone number used as its "prefix hint".
PHONE_PREFIX_WIDTH = 7

# Browser-like request headers sent with every HTTP request.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}

# Characters (whitespace and hyphen) allowed between the digits of a split OTP.
OTP_SEPARATOR_CHARS = r"[\s\-]"
# Unicode bidirectional control characters removed before OTP matching.
OTP_BIDI_CHARS_RE = re.compile(r"[\u200e\u200f\u202a-\u202e\u2066-\u2069]")
# A run of 4-8 digits, optionally separated by whitespace/hyphens, that is
# not adjacent to another digit.
OTP_SPLIT_CANDIDATE_RE = re.compile(r"(?<!\d)(\d(?:[\s\-]*\d){3,7})(?!\d)")
# Default age limit (minutes) for a number's SMS history to count as recent.
DEFAULT_RECENT_HISTORY_MINUTES = 60.0
|
||
|
||
|
||
def _normalize_start_page(start_page: int | None) -> int:
|
||
value = int(start_page or 1)
|
||
if value < 1:
|
||
raise ValueError(f"start_page must be >= 1, got {start_page}")
|
||
return value
|
||
|
||
|
||
def _resolve_country_page_window(
    *,
    detected_max_page: int,
    start_page: int = 1,
    max_pages_per_country: Optional[int] = DEFAULT_SYNC_MAX_PAGES_PER_COUNTRY,
) -> list[int]:
    """Return the list of page numbers to visit for one country.

    The window starts at ``start_page`` and ends at ``detected_max_page``,
    or earlier when ``max_pages_per_country`` limits the number of pages.
    An empty list is returned when the start page lies beyond the last page.

    Raises:
        ValueError: if ``start_page`` or ``max_pages_per_country`` is below 1.
    """
    first = _normalize_start_page(start_page)
    if detected_max_page < first:
        return []

    last = detected_max_page
    if max_pages_per_country is not None:
        if max_pages_per_country < 1:
            raise ValueError(f"max_pages_per_country must be >= 1, got {max_pages_per_country}")
        last = min(detected_max_page, first + max_pages_per_country - 1)
    return list(range(first, last + 1))
|
||
|
||
|
||
def _normalize_message_text_for_otp(message_text: str) -> str:
    """Remove bidi control characters from the text and trim whitespace."""
    raw = message_text if message_text else ""
    return OTP_BIDI_CHARS_RE.sub("", raw).strip()
|
||
|
||
|
||
def _extract_otp_from_text(
    message_text: str,
    *,
    min_digits: int = 4,
    max_digits: int = 8,
) -> Optional[str]:
    """Return the first digit run in the message whose length is acceptable.

    Candidate runs may be split by whitespace or hyphens; separators are
    removed before the length check. Returns ``None`` when the message is
    empty or no candidate has between ``min_digits`` and ``max_digits`` digits.
    """
    cleaned = _normalize_message_text_for_otp(message_text)
    if not cleaned:
        return None

    compacted = (
        re.sub(OTP_SEPARATOR_CHARS, "", hit.group(1))
        for hit in OTP_SPLIT_CANDIDATE_RE.finditer(cleaned)
    )
    return next(
        (code for code in compacted if min_digits <= len(code) <= max_digits),
        None,
    )
|
||
|
||
|
||
def _extract_recent_6digit_otp(message_text: str, received_text: str) -> Optional[str]:
|
||
"""优先匹配“最近约 1 分钟内”的 6 位验证码。"""
|
||
|
||
msg = (message_text or "").strip()
|
||
recv = (received_text or "").strip().lower()
|
||
if not msg:
|
||
return None
|
||
|
||
recent_markers = (
|
||
"just now",
|
||
"few seconds",
|
||
"second ago",
|
||
"seconds ago",
|
||
"sec ago",
|
||
"secs ago",
|
||
"now",
|
||
)
|
||
is_recent = any(marker in recv for marker in recent_markers)
|
||
|
||
if not is_recent:
|
||
# 兼容 "1 min ago" / "1 minute ago" 等形式
|
||
minute_match = re.search(r"(\d+)\s*(m|min|mins|minute|minutes)\b", recv)
|
||
if minute_match:
|
||
is_recent = int(minute_match.group(1)) <= 1
|
||
|
||
if not is_recent:
|
||
return None
|
||
|
||
return _extract_otp_from_text(msg, min_digits=6, max_digits=6)
|
||
|
||
|
||
def _parse_received_age_minutes(received_text: str) -> Optional[float]:
|
||
recv = (received_text or "").strip().lower()
|
||
if not recv:
|
||
return None
|
||
|
||
immediate_markers = (
|
||
"just now",
|
||
"few seconds",
|
||
"second ago",
|
||
"seconds ago",
|
||
"sec ago",
|
||
"secs ago",
|
||
"moments ago",
|
||
"now",
|
||
)
|
||
if any(marker in recv for marker in immediate_markers):
|
||
return 0.0
|
||
|
||
if re.search(r"\ban?\s+(m|min|mins|minute|minutes)\b", recv):
|
||
return 1.0
|
||
if re.search(r"\ban?\s+(h|hr|hrs|hour|hours)\b", recv):
|
||
return 60.0
|
||
if "yesterday" in recv:
|
||
return 24.0 * 60.0
|
||
|
||
match = re.search(
|
||
r"(\d+)\s*(s|sec|secs|second|seconds|m|min|mins|minute|minutes|h|hr|hrs|hour|hours|d|day|days)\b",
|
||
recv,
|
||
)
|
||
if not match:
|
||
return None
|
||
|
||
value = int(match.group(1))
|
||
unit = match.group(2)
|
||
if unit.startswith("s"):
|
||
return value / 60.0
|
||
if unit.startswith("m"):
|
||
return float(value)
|
||
if unit.startswith("h"):
|
||
return float(value) * 60.0
|
||
if unit.startswith("d"):
|
||
return float(value) * 24.0 * 60.0
|
||
return None
|
||
|
||
|
||
@dataclass(frozen=True)
class PhoneEntry:
    """A single SMSToMe phone number record (immutable)."""

    # Country slug as used in the site's URLs.
    country_slug: str
    # Phone number text exactly as shown in the listing.
    phone: str  # e.g. "+48573583699"
    # Absolute URL of the number's SMS detail page.
    detail_url: str  # e.g. "https://smstome.com/poland/phone/48573583699/sms/14642"
|
||
|
||
|
||
@dataclass(frozen=True)
class SmsMessage:
    """A single SMS row scraped from a number's detail page (immutable)."""

    # Text of the first table cell (the sender label).
    from_label: str
    # Text of the second table cell (relative "received" label).
    received_text: str
    # Text of the third table cell (the message body).
    message_text: str
|
||
|
||
|
||
class SmsOtpPollingError(RuntimeError):
    """Base class for failures raised while polling for an OTP."""


class SmsInboxEmptyError(SmsOtpPollingError):
    """The inbox contained no messages when polling timed out."""


class SmsOtpTimeoutError(SmsOtpPollingError):
    """Polling timed out without finding an OTP in a non-empty inbox."""


class SmsOtpFetchError(SmsOtpPollingError):
    """Fetching the SMS page failed during polling."""
|
||
|
||
|
||
def _summarize_sms_message(message: SmsMessage | None, *, max_len: int = 96) -> str:
|
||
if message is None:
|
||
return "none"
|
||
snippet = " ".join((message.message_text or "").split())
|
||
if len(snippet) > max_len:
|
||
snippet = snippet[: max_len - 3] + "..."
|
||
return (
|
||
f"from={message.from_label!r}, received={message.received_text!r}, "
|
||
f"text={snippet!r}"
|
||
)
|
||
|
||
|
||
def _classify_timeout_state(
|
||
*,
|
||
latest_message: SmsMessage | None,
|
||
unmatched_new_message_count: int,
|
||
) -> str:
|
||
if latest_message is None:
|
||
return "empty-inbox"
|
||
if unmatched_new_message_count > 0:
|
||
return "new-messages-no-otp"
|
||
return "stale-inbox-no-new-messages"
|
||
|
||
|
||
def _has_recent_sms_history(
    messages: Iterable[SmsMessage],
    *,
    max_age_minutes: float = DEFAULT_RECENT_HISTORY_MINUTES,
) -> bool:
    """Report whether any message is at most ``max_age_minutes`` old.

    Messages whose "received" label cannot be parsed are ignored.
    """
    ages = (_parse_received_age_minutes(item.received_text) for item in messages)
    return any(age is not None and age <= max_age_minutes for age in ages)
|
||
|
||
|
||
def _parse_cookie_header(cookie_header: str) -> Dict[str, str]:
|
||
"""将浏览器复制的 Cookie 字符串解析为字典。
|
||
|
||
例如:
|
||
"a=1; b=2; cf_clearance=xxx" -> {"a": "1", "b": "2", "cf_clearance": "xxx"}
|
||
"""
|
||
|
||
cookies: Dict[str, str] = {}
|
||
for part in cookie_header.split(";"):
|
||
part = part.strip()
|
||
if not part or "=" not in part:
|
||
continue
|
||
name, value = part.split("=", 1)
|
||
name = name.strip()
|
||
value = value.strip()
|
||
if name:
|
||
cookies[name] = value
|
||
return cookies
|
||
|
||
|
||
def _load_cookie_from_config(config_path: Path | str | None = None) -> Optional[str]:
    """Look up the SMSToMe cookie in the project's configuration sources.

    The optional ``core.config_store`` is consulted first (key
    ``smstome_cookie``); any failure there is ignored. Otherwise the YAML
    config at ``config_path`` (default: ``DEFAULT_CONFIG_PATH``) is read for
    ``SMSTOME_COOKIE`` / ``smstome_cookie``.
    """
    try:
        from core.config_store import config_store

        saved = str(config_store.get("smstome_cookie", "") or "").strip()
    except Exception:
        # The config store is optional; fall through to the YAML file.
        saved = ""
    if saved:
        return saved

    yaml_config = load_yaml_config(config_path or DEFAULT_CONFIG_PATH)
    return get_nonempty_str(yaml_config, "SMSTOME_COOKIE", "smstome_cookie")
|
||
|
||
|
||
def _resolve_cookie_header(cookie_header: Optional[str]) -> str:
|
||
explicit_cookie = (cookie_header or "").strip()
|
||
if explicit_cookie:
|
||
return explicit_cookie
|
||
|
||
env_cookie = os.getenv("SMSTOME_COOKIE", "").strip()
|
||
if env_cookie:
|
||
return env_cookie
|
||
|
||
return _load_cookie_from_config() or ""
|
||
|
||
|
||
def _build_client(*, cookie_header: Optional[str], timeout: float) -> httpx.Client:
    """Create an ``httpx.Client`` with the default UA headers and optional cookies.

    Redirects are followed and environment proxy settings are not inherited.
    The caller is responsible for closing the returned client.
    """
    resolved_cookie = _resolve_cookie_header(cookie_header)
    jar: Dict[str, str] = _parse_cookie_header(resolved_cookie) if resolved_cookie else {}
    return httpx.Client(
        headers=dict(DEFAULT_HEADERS),
        cookies=jar,
        timeout=timeout,
        follow_redirects=True,
        trust_env=False,  # do not inherit env proxies, to avoid affecting the Tavily traffic policy
    )
|
||
|
||
|
||
def _polite_sleep(base_delay: float, jitter: float) -> None:
|
||
"""在请求之间添加一点随机延迟,用于简单规避风控。
|
||
|
||
Args:
|
||
base_delay: 基础延迟秒数,<=0 表示不等待。
|
||
jitter: 抖动上限秒数,>0 时会在 [0, jitter] 之间随机增加额外延迟。
|
||
"""
|
||
|
||
if base_delay <= 0:
|
||
return
|
||
extra = random.uniform(0, jitter) if jitter > 0 else 0.0
|
||
time.sleep(base_delay + extra)
|
||
|
||
|
||
def _fetch_with_retries(
|
||
client: httpx.Client,
|
||
url: str,
|
||
*,
|
||
max_attempts: int = 3,
|
||
backoff_factor: float = 0.5,
|
||
) -> str:
|
||
"""带简单重试的 GET 请求,返回文本内容。
|
||
|
||
- 对网络异常 / 5xx 做有限次重试;
|
||
- 对 4xx(例如 403/404)不做额外特殊处理,直接抛出。
|
||
"""
|
||
|
||
last_exc: Optional[Exception] = None
|
||
for attempt in range(1, max_attempts + 1):
|
||
try:
|
||
resp = client.get(url)
|
||
resp.raise_for_status()
|
||
return resp.text
|
||
except (httpx.RequestError, httpx.HTTPStatusError) as exc: # noqa: PERF203
|
||
last_exc = exc
|
||
# 4xx 错误通常不需要重试
|
||
status = getattr(exc, "response", None)
|
||
status_code = getattr(status, "status_code", None)
|
||
if isinstance(status_code, int) and 400 <= status_code < 500:
|
||
raise
|
||
|
||
if attempt >= max_attempts:
|
||
raise
|
||
sleep_s = backoff_factor * attempt
|
||
time.sleep(sleep_s)
|
||
|
||
# 正常逻辑不会走到这里
|
||
raise RuntimeError(f"Failed to fetch {url!r}: {last_exc}")
|
||
|
||
|
||
def _detect_max_page(tree: HTMLParser) -> int:
|
||
"""从国家列表页中解析最大页码,若没有分页则返回 1。"""
|
||
|
||
max_page = 1
|
||
# 仅关注包含 `?page=` 的链接,避免抓到其它数字
|
||
for a in tree.css("a[href*='?page=']"):
|
||
text = (a.text() or "").strip()
|
||
if text.isdigit():
|
||
try:
|
||
value = int(text)
|
||
except ValueError:
|
||
continue
|
||
if value > max_page:
|
||
max_page = value
|
||
return max_page
|
||
|
||
|
||
def _collect_numbers_from_country_page(
|
||
tree: HTMLParser,
|
||
country_slug: str,
|
||
phone_map: Dict[str, PhoneEntry],
|
||
) -> None:
|
||
"""从单个国家页解析所有号码并写入 phone_map。"""
|
||
|
||
for article in tree.css("article"):
|
||
link = article.css_first("a[href*='/phone/']")
|
||
if link is None:
|
||
continue
|
||
phone_text = (link.text() or "").strip()
|
||
if not phone_text:
|
||
continue
|
||
href = (link.attributes.get("href") or "").strip()
|
||
if not href:
|
||
continue
|
||
|
||
detail_url = urljoin(SMSTOME_BASE_URL + "/", href)
|
||
# 以手机号去重,后出现的记录会覆盖之前的(一般无影响)
|
||
phone_map[phone_text] = PhoneEntry(
|
||
country_slug=country_slug,
|
||
phone=phone_text,
|
||
detail_url=detail_url,
|
||
)
|
||
|
||
|
||
def _find_phone_entry_on_country_page(
|
||
tree: HTMLParser,
|
||
*,
|
||
phone: str,
|
||
country_slug: str,
|
||
) -> Optional[PhoneEntry]:
|
||
target_phone = (phone or "").strip()
|
||
if not target_phone:
|
||
return None
|
||
|
||
phone_map: Dict[str, PhoneEntry] = {}
|
||
_collect_numbers_from_country_page(tree, country_slug, phone_map)
|
||
return phone_map.get(target_phone)
|
||
|
||
|
||
def resolve_live_phone_entry(
    entry: PhoneEntry,
    *,
    cookie_header: Optional[str] = None,
    request_timeout: float = 20.0,
    http_max_attempts: int = 3,
    max_pages_per_country: Optional[int] = None,
    start_page: int = 1,
    per_page_delay: float = 0.0,
    jitter: float = 0.0,
) -> Optional[PhoneEntry]:
    """Refresh ``entry`` from the live country listing pages.

    The country's listing pages are scanned for ``entry.phone``; when found,
    a new entry built from the live page (with its current detail URL) is
    returned. When the number is not found, or the entry's detail URL does
    not point at smstome.com, the original ``entry`` is returned unchanged.

    Args:
        entry: The stored phone entry to refresh.
        cookie_header: Optional Cookie string; resolved from the environment
            or configuration when omitted.
        request_timeout: HTTP timeout in seconds.
        http_max_attempts: Maximum attempts per HTTP request.
        max_pages_per_country: Maximum number of pages to scan starting at
            ``start_page``; ``None`` scans up to the last detected page.
        start_page: First listing page to scan.
        per_page_delay: Base delay in seconds before each additional page
            fetch.
        jitter: Upper bound in seconds of the random extra delay.

    Returns:
        The refreshed entry, or ``entry`` itself when no refresh happened.

    Raises:
        Whatever the underlying HTTP fetch raises, and ``ValueError`` for an
        invalid ``start_page`` / ``max_pages_per_country``.
    """
    detail_host = (urlsplit(entry.detail_url).netloc or "").lower()
    if "smstome.com" not in detail_host:
        return entry

    client = _build_client(cookie_header=cookie_header, timeout=request_timeout)
    try:
        first_url = f"{SMSTOME_BASE_URL}/country/{entry.country_slug}"
        first_page_html = _fetch_with_retries(client, first_url, max_attempts=http_max_attempts)
        page_window = _resolve_country_page_window(
            detected_max_page=_detect_max_page(HTMLParser(first_page_html)),
            start_page=start_page,
            max_pages_per_country=max_pages_per_country,
        )

        for page in page_window:
            if page == 1:
                # Page 1 was already downloaded to detect the pagination.
                html = first_page_html
            else:
                # Exactly one delay before every additional request. The
                # previous code also slept after each page, doubling the
                # pause between consecutive pages.
                _polite_sleep(per_page_delay, jitter)
                html = _fetch_with_retries(
                    client,
                    f"{first_url}?page={page}",
                    max_attempts=http_max_attempts,
                )
            resolved = _find_phone_entry_on_country_page(
                HTMLParser(html),
                phone=entry.phone,
                country_slug=entry.country_slug,
            )
            if resolved is not None:
                return resolved
        return entry
    finally:
        client.close()
|
||
|
||
|
||
def update_global_phone_list(
    *,
    cookie_header: Optional[str] = None,
    countries: Optional[Iterable[str]] = None,
    output_path: Path | str = GLOBAL_PHONE_FILE,
    request_timeout: float = 20.0,
    http_max_attempts: int = 3,
    max_pages_per_country: Optional[int] = DEFAULT_SYNC_MAX_PAGES_PER_COUNTRY,
    start_page: int = 1,
    per_page_delay: float = 1.0,
    per_country_delay: float = 3.0,
    jitter: float = 0.5,
    require_recent_history: bool = True,
    recent_history_minutes: float = DEFAULT_RECENT_HISTORY_MINUTES,
) -> int:
    """Scrape the numbers of several countries and write them to a txt file.

    Each output line is tab-separated ``phone``, ``country_slug`` and
    ``detail_url``, for example::

        +48573583699    poland    https://smstome.com/poland/phone/48573583699/sms/14642

    Args:
        cookie_header: Optional Cookie string; when ``None`` the
            ``SMSTOME_COOKIE`` environment variable is tried, then the
            configuration sources.
        countries: Country slugs to sync; ``None`` uses
            ``DEFAULT_COUNTRY_SLUGS``.
        output_path: Path of the global number txt file.
        request_timeout: HTTP request timeout in seconds.
        http_max_attempts: Maximum attempts for a single request.
        max_pages_per_country: Maximum number of pages to scrape starting at
            ``start_page``; ``None`` scrapes up to the last detected page.
        start_page: First listing page to scrape for each country.
        per_page_delay: Base delay in seconds before each additional page
            fetch.
        per_country_delay: Base delay in seconds after each country.
        jitter: Upper bound in seconds of the random extra delay added to
            the delays above.
        require_recent_history: When true, each number's detail page is
            fetched and only numbers with a sufficiently recent message are
            kept; numbers whose detail page cannot be fetched are dropped.
        recent_history_minutes: Maximum message age in minutes used by the
            recent-history filter.

    Returns:
        The number of de-duplicated phone numbers written to the file.

    Raises:
        Whatever the listing-page fetch raises (a failing country aborts the
        sync before anything is written), and ``ValueError`` for an invalid
        ``start_page`` / ``max_pages_per_country``.
    """
    if countries is None:
        countries = DEFAULT_COUNTRY_SLUGS

    client = _build_client(cookie_header=cookie_header, timeout=request_timeout)
    try:
        phone_map: Dict[str, PhoneEntry] = {}

        for country_slug in countries:
            first_url = f"{SMSTOME_BASE_URL}/country/{country_slug}"
            first_page_html = _fetch_with_retries(client, first_url, max_attempts=http_max_attempts)
            page_window = _resolve_country_page_window(
                detected_max_page=_detect_max_page(HTMLParser(first_page_html)),
                start_page=start_page,
                max_pages_per_country=max_pages_per_country,
            )

            for page in page_window:
                if page == 1:
                    # Page 1 was already downloaded to detect the pagination.
                    html = first_page_html
                else:
                    # Exactly one delay before every additional request. The
                    # previous code slept both after page 1 and before
                    # page 2, doubling that pause.
                    _polite_sleep(per_page_delay, jitter)
                    html = _fetch_with_retries(
                        client,
                        f"{first_url}?page={page}",
                        max_attempts=http_max_attempts,
                    )
                _collect_numbers_from_country_page(HTMLParser(html), country_slug, phone_map)

            # Pause a little after each country.
            _polite_sleep(per_country_delay, jitter)

        if require_recent_history:
            filtered_phone_map: Dict[str, PhoneEntry] = {}
            for phone in sorted(phone_map):
                entry = phone_map[phone]
                try:
                    messages = _fetch_sms_messages(
                        client,
                        entry.detail_url,
                        http_max_attempts=http_max_attempts,
                    )
                except Exception:
                    # Best effort: a number whose detail page cannot be
                    # fetched is left out instead of failing the whole sync.
                    continue
                if _has_recent_sms_history(messages, max_age_minutes=recent_history_minutes):
                    filtered_phone_map[phone] = entry
            phone_map = filtered_phone_map

        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        # Only the phone number is strictly required, but the country and
        # detail URL are stored too for later convenience.
        with output.open("w", encoding="utf-8") as f:
            f.writelines(
                f"{entry.phone}\t{entry.country_slug}\t{entry.detail_url}\n"
                for _, entry in sorted(phone_map.items())
            )

        return len(phone_map)
    finally:
        client.close()
|
||
|
||
|
||
def load_global_phone_index(path: Path | str = GLOBAL_PHONE_FILE) -> Dict[str, PhoneEntry]:
    """Load the phone index from the global number txt file.

    Blank lines, ``#`` comment lines and lines with fewer than three
    tab-separated fields are skipped. Entries are keyed by phone number.

    Raises:
        FileNotFoundError: if the file does not exist.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"Global phone list not found: {source}")

    index: Dict[str, PhoneEntry] = {}
    with source.open("r", encoding="utf-8") as handle:
        for raw in handle:
            record = raw.strip()
            if not record or record.startswith("#"):
                continue
            fields = record.split("\t")
            if len(fields) < 3:
                continue
            number, slug, url = fields[:3]
            index[number] = PhoneEntry(country_slug=slug, phone=number, detail_url=url)
    return index
|
||
|
||
|
||
def _sanitize_task_name(task_name: str) -> str:
|
||
"""将任务名转换为适合作为文件名的形式。"""
|
||
|
||
return re.sub(r"[^a-zA-Z0-9_.-]", "_", task_name)
|
||
|
||
|
||
def _used_numbers_file(task_name: str, *, base_dir: Path | str = USED_NUMBERS_DIR) -> Path:
    """Return the path of a task's "used numbers" file.

    ``base_dir`` is created (including parents) if it does not exist yet.
    """
    folder = Path(base_dir)
    folder.mkdir(parents=True, exist_ok=True)
    file_name = _sanitize_task_name(task_name) + USED_NUMBERS_SUFFIX
    return folder / file_name
|
||
|
||
|
||
def _blacklisted_numbers_file(task_name: str, *, base_dir: Path | str = USED_NUMBERS_DIR) -> Path:
    """Return the path of a task's "blacklisted numbers" file.

    ``base_dir`` is created (including parents) if it does not exist yet.
    """
    folder = Path(base_dir)
    folder.mkdir(parents=True, exist_ok=True)
    file_name = _sanitize_task_name(task_name) + BLACKLISTED_NUMBERS_SUFFIX
    return folder / file_name
|
||
|
||
|
||
def _load_phone_set(path: Path) -> set[str]:
|
||
values: set[str] = set()
|
||
if not path.exists():
|
||
return values
|
||
with path.open("r", encoding="utf-8") as f:
|
||
for line in f:
|
||
value = line.strip()
|
||
if value:
|
||
values.add(value)
|
||
return values
|
||
|
||
|
||
def _phone_prefix_hint(phone: str, *, width: int = PHONE_PREFIX_WIDTH) -> str:
    """Return the first ``width`` characters of the stripped phone text.

    Returns ``""`` for a blank or missing phone.
    """
    trimmed = (phone or "").strip()
    return trimmed[:width] if trimmed else ""
|
||
|
||
|
||
def mark_phone_blacklisted(
    task_name: str,
    phone: str,
    *,
    used_numbers_dir: Path | str = USED_NUMBERS_DIR,
) -> None:
    """Append ``phone`` to the task's blacklist file unless already listed.

    A blank phone is ignored.
    """
    number = (phone or "").strip()
    if not number:
        return

    target = _blacklisted_numbers_file(task_name, base_dir=used_numbers_dir)
    if number not in _load_phone_set(target):
        with target.open("a", encoding="utf-8") as handle:
            handle.write(number + "\n")
|
||
|
||
|
||
def parse_country_slugs(country_slug: Optional[str | Iterable[str]]) -> list[str]:
    """Normalize one or more country slugs into an ordered, unique list.

    Accepts a single string or an iterable of items; each is split on
    whitespace, ``,``, ``;`` and ``|``. Slugs are lower-cased, ``_`` becomes
    ``-``, and duplicates are dropped while keeping first-seen order.
    ``None`` yields an empty list.
    """
    if country_slug is None:
        return []

    sources = [country_slug] if isinstance(country_slug, str) else country_slug
    splitter = re.compile(r"[\s,;|]+")

    # A dict keeps insertion order and gives the de-duplication for free.
    ordered: dict[str, None] = {}
    for source in sources:
        for token in splitter.split(str(source).strip()):
            slug = token.strip().lower().replace("_", "-")
            if slug:
                ordered.setdefault(slug, None)
    return list(ordered)
|
||
|
||
|
||
def get_unused_phone(
    task_name: str,
    *,
    country_slug: Optional[str | Iterable[str]] = None,
    global_file: Path | str = GLOBAL_PHONE_FILE,
    used_numbers_dir: Path | str = USED_NUMBERS_DIR,
    exclude_prefixes: Optional[Iterable[str]] = None,
) -> Optional[PhoneEntry]:
    """Return a number not yet used by the task and mark it as used.

    Callers should run ``update_global_phone_list`` beforehand so that
    ``global_file`` is up to date.

    Args:
        task_name: Task name (e.g. an identifier of the target site) used to
            separate the usage record files of different tasks.
        country_slug: When given, only numbers from this country or list of
            countries are considered; accepts a single slug, a delimited
            string or an iterable of slugs. ``None`` means any country.
        global_file: Path of the global number file.
        used_numbers_dir: Directory holding each task's "used numbers" file.
        exclude_prefixes: Optional phone prefixes to avoid, e.g. number
            ranges the target site has rejected. Each value is shortened to
            at most ``PHONE_PREFIX_WIDTH`` characters, and every number
            starting with one of them is skipped.

    Returns:
        An unused ``PhoneEntry``, or ``None`` when no number is available.

    Raises:
        FileNotFoundError: if ``global_file`` does not exist.
    """
    phone_index = load_global_phone_index(global_file)

    used_file = _used_numbers_file(task_name, base_dir=used_numbers_dir)
    blacklist_file = _blacklisted_numbers_file(task_name, base_dir=used_numbers_dir)
    unavailable = _load_phone_set(used_file) | _load_phone_set(blacklist_file)

    # A tuple lets `str.startswith` test all prefixes at once. Prefix matching
    # (rather than equality with the number's first PHONE_PREFIX_WIDTH
    # characters) makes prefixes shorter than that width work as well;
    # full-width hints match exactly as before.
    excluded_prefixes = tuple(
        {
            hint
            for hint in (_phone_prefix_hint(value) for value in (exclude_prefixes or ()))
            if hint
        }
    )
    allowed_countries = set(parse_country_slugs(country_slug))

    candidates = [
        entry
        for entry in phone_index.values()
        if (not allowed_countries or entry.country_slug in allowed_countries)
        and entry.phone not in unavailable
        and not (entry.phone or "").strip().startswith(excluded_prefixes)
    ]
    if not candidates:
        return None

    remaining = list(candidates)
    while remaining:
        entry = random.choice(remaining)
        remaining.remove(entry)
        try:
            refreshed_entry = resolve_live_phone_entry(entry)
        except Exception:
            # Best effort: if the live refresh fails, use the stored entry.
            refreshed_entry = entry
        if refreshed_entry is None:
            continue
        # Record the number as used before handing it out.
        with used_file.open("a", encoding="utf-8") as f:
            f.write(refreshed_entry.phone + "\n")
        return refreshed_entry
    return None
|
||
|
||
|
||
def _fetch_sms_messages(
    client: httpx.Client,
    detail_url: str,
    *,
    http_max_attempts: int,
) -> List[SmsMessage]:
    """Fetch the SMS list shown on a number's detail page (first page only).

    Rows are read from the first ``<table>`` on the page; header rows, rows
    with fewer than three cells and rows with an empty message are skipped.
    Returns an empty list when the page has no table.
    """
    page = HTMLParser(_fetch_with_retries(client, detail_url, max_attempts=http_max_attempts))

    # The page has a single main SMS table, so the first one is used.
    table = page.css_first("table")
    if table is None:
        return []

    collected: List[SmsMessage] = []
    for row in table.css("tr"):
        # Skip header rows (those containing a th).
        if row.css_first("th") is not None:
            continue
        cells = row.css("td")
        if len(cells) < 3:
            continue
        sender_cell, received_cell, body_cell = cells[:3]
        body = body_cell.text(separator=" ", strip=True)
        if not body:
            continue
        collected.append(
            SmsMessage(
                from_label=sender_cell.text(strip=True),
                received_text=received_cell.text(strip=True),
                message_text=body,
            )
        )
    return collected
|
||
|
||
|
||
def wait_for_otp(
    entry: PhoneEntry,
    *,
    cookie_header: Optional[str] = None,
    timeout: float = 120.0,
    poll_interval: float = 5.0,
    otp_regex: str = r"\b(\d{4,8})\b",
    http_max_attempts: int = 3,
    trace: Callable[[str], None] | None = None,
    raise_on_timeout: bool = False,
) -> Optional[str]:
    """Poll a number's SMS page and return the extracted verification code.

    Basic flow:
    1. Fetch the current SMS list once and record those messages as seen.
    2. Within ``timeout``, re-fetch every ``poll_interval`` seconds.
    3. On every fetch, if the newest message is labelled as received within
       about a minute and contains a 6-digit code, return that code.
    4. Otherwise match each unseen message against ``otp_regex``, then
       against the generic split-digit extractor.
    5. Return the first code found; on timeout return ``None`` or raise.

    Args:
        entry: A ``PhoneEntry`` obtained from ``get_unused_phone`` or
            elsewhere.
        cookie_header: Optional Cookie string; when ``None`` the
            ``SMSTOME_COOKIE`` environment variable is tried, then the
            configuration sources.
        timeout: Maximum waiting time in seconds.
        poll_interval: Polling interval in seconds.
        otp_regex: Regular expression used to extract the code; the first
            capture group is used when it matched, otherwise the whole match.
            The default matches 4-8 digits.
        http_max_attempts: HTTP attempts for each SMS page fetch.
        trace: Optional logging callback receiving diagnostic summaries.
        raise_on_timeout: When true, raise a specific exception on timeout
            instead of returning ``None``.

    Returns:
        The matched code, or ``None`` on timeout.

    Raises:
        SmsOtpFetchError: when fetching the SMS page fails.
        SmsInboxEmptyError: on timeout with an empty inbox
            (only with ``raise_on_timeout``).
        SmsOtpTimeoutError: on any other timeout
            (only with ``raise_on_timeout``).
    """
    # Compile first so an invalid pattern fails before a client is opened.
    pattern = re.compile(otp_regex)
    emit = trace or (lambda _msg: None)

    # NOTE(review): `timeout` is also used as the per-request HTTP timeout.
    client = _build_client(cookie_header=cookie_header, timeout=timeout)
    try:
        seen_messages: set[str] = set()
        unmatched_new_message_count = 0
        latest_unmatched_message: SmsMessage | None = None

        def _fetch_messages(phase: str, *, poll_number: int | None = None) -> List[SmsMessage]:
            """Fetch the SMS list, tracing and wrapping any failure."""
            try:
                return _fetch_sms_messages(
                    client, entry.detail_url, http_max_attempts=http_max_attempts
                )
            except Exception as exc:
                label = f"{phase} fetch-error"
                if poll_number is not None:
                    label += f" poll={poll_number}"
                emit(f"{label} type={type(exc).__name__} error={exc}")
                raise SmsOtpFetchError(
                    f"SMSToMe {phase} fetch failed for {entry.phone}: {exc}"
                ) from exc

        def _snapshot(message: SmsMessage | None):
            """Return a comparable tuple for the newest message, or None."""
            if message is None:
                return None
            return (message.from_label, message.received_text, message.message_text)

        def _quick_recent_otp(message: SmsMessage | None) -> Optional[str]:
            """Return a fresh 6-digit code from the newest message, if any."""
            if message is None:
                return None
            quick_otp = _extract_recent_6digit_otp(
                message.message_text,
                message.received_text,
            )
            if quick_otp:
                emit(
                    "matched quick recent OTP "
                    f"code={quick_otp} latest={_summarize_sms_message(message)}"
                )
            return quick_otp

        # Initial fetch, so historical messages are not mistaken for new ones.
        initial_messages = _fetch_messages("initial")
        latest_message = initial_messages[0] if initial_messages else None
        latest_snapshot = _snapshot(latest_message)
        emit(
            f"poll start phone={entry.phone} messages={len(initial_messages)} "
            f"latest={_summarize_sms_message(latest_message)}"
        )
        quick_otp = _quick_recent_otp(latest_message)
        if quick_otp:
            return quick_otp

        for msg in initial_messages:
            seen_messages.add(msg.message_text)

        deadline = time.monotonic() + timeout
        poll_count = 0

        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                timeout_state = _classify_timeout_state(
                    latest_message=latest_message,
                    unmatched_new_message_count=unmatched_new_message_count,
                )
                summary = (
                    f"final state={timeout_state} polls={poll_count} "
                    f"latest={_summarize_sms_message(latest_message)}"
                )
                if latest_unmatched_message is not None:
                    summary += (
                        " first_unmatched_new="
                        + _summarize_sms_message(latest_unmatched_message)
                    )
                emit(summary)
                emit(
                    f"timeout after {poll_count} poll(s); latest={_summarize_sms_message(latest_message)}"
                )
                if raise_on_timeout:
                    if latest_message is None:
                        raise SmsInboxEmptyError(
                            f"SMSToMe inbox stayed empty for {entry.phone} after {poll_count} poll(s)"
                        )
                    raise SmsOtpTimeoutError(
                        f"SMSToMe OTP timeout state={timeout_state} for {entry.phone} "
                        f"after {poll_count} poll(s); latest={_summarize_sms_message(latest_message)}"
                    )
                return None

            # Never sleep past the deadline.
            sleep_s = min(poll_interval, max(remaining, 0))
            if sleep_s > 0:
                time.sleep(sleep_s)

            poll_count += 1
            messages = _fetch_messages("poll", poll_number=poll_count)
            latest_message = messages[0] if messages else None
            current_snapshot = _snapshot(latest_message)
            new_count = sum(1 for msg in messages if msg.message_text not in seen_messages)
            # Trace the first few polls, then only when something changed.
            if poll_count <= 3 or current_snapshot != latest_snapshot or new_count:
                emit(
                    f"poll {poll_count}: messages={len(messages)} new={new_count} "
                    f"latest={_summarize_sms_message(latest_message)}"
                )
            latest_snapshot = current_snapshot

            quick_otp = _quick_recent_otp(latest_message)
            if quick_otp:
                return quick_otp

            for msg in messages:
                if msg.message_text in seen_messages:
                    continue
                seen_messages.add(msg.message_text)
                unmatched_new_message_count += 1
                latest_unmatched_message = msg
                normalized_text = _normalize_message_text_for_otp(msg.message_text)
                match = pattern.search(normalized_text)
                if match:
                    # Prefer capture group 1; fall back to the whole match so
                    # a custom pattern without a group does not crash.
                    captured = (match.group(1) if pattern.groups else None) or match.group(0)
                    code = re.sub(OTP_SEPARATOR_CHARS, "", captured)
                    emit(f"matched regex OTP code={code} message={_summarize_sms_message(msg)}")
                    return code
                fallback_otp = _extract_otp_from_text(msg.message_text)
                if fallback_otp:
                    emit(
                        f"matched fallback OTP code={fallback_otp} "
                        f"message={_summarize_sms_message(msg)}"
                    )
                    return fallback_otp
            if new_count and latest_unmatched_message is not None:
                emit(
                    "new messages arrived without OTP match "
                    f"count={new_count} sample={_summarize_sms_message(latest_unmatched_message)}"
                )
    finally:
        # Release the HTTP connection pool on every exit path; the previous
        # code never closed the client.
        client.close()
|
||
|
||
|
||
if __name__ == "__main__":  # pragma: no cover - simple debugging entry point
    import argparse

    cli = argparse.ArgumentParser(description="SMSToMe phone pool & OTP helper")
    commands = cli.add_subparsers(dest="command", required=True)

    # `sync`: scrape the listing pages and write the global number file.
    sync_cmd = commands.add_parser("sync", help="同步全量手机号到 txt 文件")
    sync_cmd.add_argument(
        "--cookie",
        dest="cookie",
        help="可选 Cookie 字符串;为空则使用 SMSTOME_COOKIE 环境变量或 config.yaml",
    )
    max_pages_help = (
        "从起始页开始,每个国家最多抓取多少页;"
        f"默认 {DEFAULT_SYNC_MAX_PAGES_PER_COUNTRY}"
    )
    sync_cmd.add_argument(
        "--max-pages-per-country",
        dest="max_pages_per_country",
        type=int,
        default=DEFAULT_SYNC_MAX_PAGES_PER_COUNTRY,
        help=max_pages_help,
    )
    sync_cmd.add_argument(
        "--start-page",
        dest="start_page",
        type=int,
        default=1,
        help="每个国家从第几页开始抓;默认 1",
    )
    sync_cmd.add_argument(
        "--countries",
        dest="countries",
        help="可选国家 slug 列表;支持单个 slug 或逗号分隔,例如 united-kingdom,sweden",
    )
    sync_cmd.add_argument(
        "--output",
        dest="output_path",
        default=str(GLOBAL_PHONE_FILE),
        help=f"同步结果输出文件;默认 {GLOBAL_PHONE_FILE}",
    )
    sync_cmd.add_argument(
        "--skip-history-check",
        dest="skip_history_check",
        action="store_true",
        help="不同步详情页历史活跃度;默认会过滤掉没有分钟级历史短信的号码",
    )
    recent_history_help = (
        "同步时仅保留最近 N 分钟内有历史短信的号码;"
        f"默认 {int(DEFAULT_RECENT_HISTORY_MINUTES)}"
    )
    sync_cmd.add_argument(
        "--recent-history-minutes",
        dest="recent_history_minutes",
        type=float,
        default=DEFAULT_RECENT_HISTORY_MINUTES,
        help=recent_history_help,
    )

    # `pick`: choose one unused number for a task.
    pick_cmd = commands.add_parser("pick", help="为某个任务选择一个未使用的手机号")
    pick_cmd.add_argument("task", help="任务名称,用于区分已使用号码文件")
    pick_cmd.add_argument(
        "--country",
        dest="country",
        help="可选国家 slug(例如 poland、sweden)",
    )

    args = cli.parse_args()

    if args.command == "sync":
        # An empty --countries value falls back to the default country list.
        selected_countries = parse_country_slugs(args.countries) or None
        count = update_global_phone_list(
            cookie_header=args.cookie,
            countries=selected_countries,
            output_path=args.output_path,
            max_pages_per_country=args.max_pages_per_country,
            start_page=args.start_page,
            require_recent_history=not args.skip_history_check,
            recent_history_minutes=args.recent_history_minutes,
        )
        print(f"Synced {count} phone numbers into {args.output_path}")
    elif args.command == "pick":
        entry = get_unused_phone(task_name=args.task, country_slug=args.country)
        if entry is None:
            print("No unused phone available.")
        else:
            print(
                f"Task={args.task} -> {entry.phone} "
                f"(country={entry.country_slug})",
            )
|