GMB-Scraper/lib/stealth.py
Zulkifli 5e893db025 feat: GMB Scraper v4 — production-grade pain-aware lead gen engine
- Stealth mode: playwright-stealth, random fingerprints, human delays
- Retry logic: exponential backoff (3 attempts)
- Logging: rotating logs to /root/.hermes/logs/gmb/
- Validation: phone/website/rating validation + dedup
- Pain detection: 12 signals, scoring, service matching
- Review scraper: extract reviews + pain keyword detection
- Website health: SSL, speed, mobile, contact form checks
- Pitch generator: Apex pitches (SMS, email, call, Gumtree)
- Docker containerization
- .env for secrets (no hardcoded API keys)
- Integration with Pipecat voice dialer (gmb_to_voice.py)
2026-06-06 19:45:44 +08:00

124 lines
3.7 KiB
Python

"""
Stealth Mode Module
===================
Anti-detection measures for Playwright scraping.
"""
import random
from playwright_stealth import Stealth
from .logger import get_logger
# Pools of fingerprint values. One entry from each pool is drawn at random
# when a new browser context is created.

# (width, height) pairs for common desktop screen resolutions.
VIEWPORTS = [
    (1920, 1080),
    (1366, 768),
    (1536, 864),
    (1440, 900),
    (1280, 720),
    (1600, 900),
    (2560, 1440),
    (1920, 1200),
    (1680, 1050),
]

# Desktop user-agent strings covering Chrome, Firefox, Safari and Edge.
USER_AGENTS = [
    # Chrome 125 / Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    # Chrome 124 / Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    # Chrome 125 / macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    # Firefox 126 / Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0",
    # Safari 17.5 / macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
    # Edge 125 / Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0",
]

# IANA timezone identifiers for Australian cities.
TIMEZONES = [
    "Australia/Perth",
    "Australia/Adelaide",
    "Australia/Brisbane",
    "Australia/Sydney",
    "Australia/Melbourne",
    "Australia/Hobart",
]

# English locale tags used for the context's `locale` option.
LANGUAGES = [
    "en-AU",
    "en-US",
    "en-GB",
]
def apply_stealth(context, page=None, randomize=False):
    """
    Apply playwright-stealth evasions to a page or, when no page is given,
    to the browser context itself.

    Args:
        context: Playwright browser context. It is patched directly only when
            ``page`` is not supplied.
        page: Optional Playwright page. When given, the evasions are applied
            to this page only and the context is left untouched.
        randomize: Ignored (kept for backward compat — randomization happens
            at context creation).

    Returns:
        The same ``context`` object that was passed in.
    """
    logger = get_logger()
    stealth = Stealth()

    if page is not None:
        stealth.apply_stealth_sync(page)
        logger.debug("Stealth: playwright-stealth applied")
        return context

    # Without a page this function used to do nothing at all, so a caller
    # passing only a context got no evasions despite the function's name.
    # playwright-stealth documents applying evasions to a whole context, which
    # then covers the pages created from it.
    # NOTE(review): if callers also pass each page in separately, confirm the
    # evasions tolerate being installed at both levels.
    if context is not None:
        stealth.apply_stealth_sync(context)
        logger.debug("Stealth: playwright-stealth applied to context")
    return context
def _proxy_settings(proxy):
    """Turn a proxy URL into a Playwright proxy dict, moving any embedded
    ``user:pass@`` credentials into separate username/password fields."""
    from urllib.parse import unquote, urlsplit

    if not isinstance(proxy, str):
        # Unknown shape: hand it over exactly as before.
        return {"server": proxy}

    try:
        # A scheme-less "host:port" is only parsed as a netloc when it is
        # prefixed with "//".
        parsed = urlsplit(proxy if "://" in proxy else f"//{proxy}")
    except ValueError:
        return {"server": proxy}

    userinfo, at_sign, _host = parsed.netloc.rpartition("@")
    if not at_sign:
        return {"server": proxy}

    username, colon, password = userinfo.partition(":")
    settings = {
        # Remove only the credential part; scheme, host and port stay as given.
        "server": proxy.replace(f"{userinfo}@", "", 1),
        "username": unquote(username),
    }
    if colon:
        settings["password"] = unquote(password)
    return settings


def create_stealth_context(browser, headless=True, proxy=None):
    """
    Create a browser context with a randomized fingerprint.

    The viewport, user agent, locale and timezone are each drawn at random
    from the module-level pools. The new context is then passed through
    ``apply_stealth`` (called without a page).

    Args:
        browser: Playwright browser instance
        headless: Currently unused by this function; accepted so existing
            callers keep working.
        proxy: Optional proxy URL. Credentials embedded as ``user:pass@`` are
            split out into Playwright's separate ``username`` / ``password``
            proxy fields and are never written to the log.
    Returns:
        The newly created Playwright context
    """
    logger = get_logger()

    # Base context options
    width, height = random.choice(VIEWPORTS)
    context_options = {
        "viewport": {"width": width, "height": height},
        "user_agent": random.choice(USER_AGENTS),
        "locale": random.choice(LANGUAGES),
        "timezone_id": random.choice(TIMEZONES),
    }

    # Add proxy if provided
    if proxy:
        proxy_options = _proxy_settings(proxy)
        context_options["proxy"] = proxy_options
        # Log the credential-free server only, so proxy secrets stay out of
        # the log files.
        logger.info(f"Using proxy: {proxy_options['server']}")

    # Create context
    context = browser.new_context(**context_options)

    # Apply stealth
    apply_stealth(context, randomize=False)
    return context
def human_delay(min_delay=1.0, max_delay=2.5, jitter=True):
    """
    Pick a human-looking pause length. The value is returned, not slept.

    Args:
        min_delay: Lower bound of the base delay in seconds
        max_delay: Upper bound of the base delay in seconds
        jitter: When true, roughly one call in ten stretches the base delay
            by a random factor between 1.5 and 2.5
    Returns:
        Delay in seconds
    """
    pause = random.uniform(min_delay, max_delay)
    if not jitter:
        return pause

    # Every so often simulate a distracted user with a noticeably longer pause.
    distracted = random.random() < 0.1  # 10% chance
    if distracted:
        pause *= random.uniform(1.5, 2.5)
    return pause
def human_scroll_delay():
    """Return a pause in seconds that mimics a person scrolling.

    About 70% of calls give a short pause (0.5-1.2 s); the remainder give a
    longer one (1.5-3.0 s), as if the user had stopped to read.
    """
    quick_scroll = random.random() < 0.7
    low, high = (0.5, 1.2) if quick_scroll else (1.5, 3.0)
    return random.uniform(low, high)