GMB-Scraper/lib/pain_detector.py
Zulkifli 5e893db025 feat: GMB Scraper v4 — production-grade pain-aware lead gen engine
- Stealth mode: playwright-stealth, random fingerprints, human delays
- Retry logic: exponential backoff (3 attempts)
- Logging: rotating logs to /root/.hermes/logs/gmb/
- Validation: phone/website/rating validation + dedup
- Pain detection: 12 signals, scoring, service matching
- Review scraper: extract reviews + pain keyword detection
- Website health: SSL, speed, mobile, contact form checks
- Pitch generator: Apex pitches (SMS, email, call, Gumtree)
- Docker containerization
- .env for secrets (no hardcoded API keys)
- Integration with Pipecat voice dialer (gmb_to_voice.py)
2026-06-06 19:45:44 +08:00

435 lines
13 KiB
Python

"""
Pain Detection Module
=====================
Detect business pain signals and score leads for outreach priority.
Focus: Lead Generation (highest margin service)
"""
import re
from datetime import datetime
from .logger import get_logger
# Review phrases that signal customer pain, grouped by the service category
# they point to. Every phrase is lower case; the review scanner in this
# module lower-cases review text before doing substring matching.
PAIN_KEYWORDS = {
    "lead_gen": [
        "no answer",
        "nobody answered",
        "didn't answer",
        "never answer",
        "voicemail",
        "can't reach",
        "unreachable",
        "no response",
        "didn't call back",
        "no callback",
        "never called back",
        "phone disconnected",
        "wrong number",
        "busy signal",
    ],
    "reputation": [
        "rude",
        "unprofessional",
        "terrible",
        "awful",
        "worst",
        "scam",
        "rip off",
        "overpriced",
        "expensive",
        "hidden fees",
        "waste of time",
        "waste of money",
        "don't trust",
    ],
    "website": [
        "website down",
        "can't find website",
        "no website",
        "website doesn't work",
        "broken website",
        "outdated website",
        "can't book online",
        "no online booking",
    ],
    "service_quality": [
        "slow",
        "took forever",
        "waited hours",
        "long wait",
        "unreliable",
        "didn't show up",
        "no show",
        "late",
        "poor quality",
        "bad work",
        "shoddy",
        "amateur",
    ],
}
# Catalogue of pain signals. Each entry carries:
#   weight      - points the signal contributes to a lead's pain score
#   service     - the service to pitch when the signal is present
#   margin      - margin tier of that service ('highest', 'high', 'medium', 'low')
#   description - human-readable text used in summaries
PAIN_SIGNALS = {
    "no_website": {
        "weight": 25,
        "service": "Website Development",
        "margin": "high",
        "description": "No website detected",
    },
    "broken_website": {
        "weight": 20,
        "service": "Website Maintenance",
        "margin": "medium",
        "description": "Website has issues (SSL expired, slow, not mobile-friendly)",
    },
    "low_rating": {
        "weight": 15,
        "service": "Reputation Management",
        "margin": "high",
        "description": "Rating below 3.5 stars",
    },
    "recent_1star": {
        "weight": 20,
        "service": "Review Response Service",
        "margin": "high",
        "description": "Recent 1-star reviews (last 30 days)",
    },
    "missed_calls": {
        "weight": 30,
        "service": "Lead Generation + Call Tracking",
        "margin": "highest",
        "description": "Reviews mention missed calls / no answer",
    },
    "unclaimed_gmb": {
        "weight": 12,
        "service": "GMB Optimization",
        "margin": "medium",
        "description": "Google Business profile appears unclaimed",
    },
    "missing_phone": {
        "weight": 10,
        "service": "GMB Cleanup",
        "margin": "low",
        "description": "Phone number missing from GMB",
    },
    "no_hours": {
        "weight": 5,
        "service": "GMB Optimization",
        "margin": "low",
        "description": "Business hours not listed",
    },
    "few_reviews": {
        "weight": 8,
        "service": "Review Generation Campaign",
        "margin": "medium",
        "description": "Less than 10 reviews total",
    },
    "no_contact_form": {
        "weight": 15,
        "service": "Lead Capture Optimization",
        "margin": "high",
        "description": "Website has no contact form",
    },
    "slow_website": {
        "weight": 10,
        "service": "Website Performance",
        "margin": "medium",
        "description": "Website loads slowly (>3 seconds)",
    },
    "not_mobile_friendly": {
        "weight": 12,
        "service": "Mobile Optimization",
        "margin": "medium",
        "description": "Website not mobile-friendly",
    },
}
def detect_review_pain(reviews):
    """
    Analyze reviews for pain keywords.

    A review is counted at most once per pain category, however many of
    that category's keywords it contains, so one long complaint cannot
    inflate a signal's count or fill the example list with copies of itself.

    Args:
        reviews: List of review dictionaries with 'text', 'rating', 'date'

    Returns:
        Dictionary of detected pain signals with counts. Each value holds
        'count', up to three 'examples' (lower-cased text truncated to 200
        characters, plus rating and date) and the matching 'signal_info'.
    """
    detected = {}
    if not reviews:
        return detected

    # Categories that map to a signal; any other category in PAIN_KEYWORDS
    # (currently 'service_quality') is ignored here.
    mapped_categories = ('lead_gen', 'reputation', 'website')

    for review in reviews:
        # A field may be present but None; treat that like a missing field.
        text = (review.get('text') or '').lower()
        if not text:
            continue
        rating = review.get('rating')
        if rating is None:
            rating = 5
        review_date = review.get('date', '')

        for category, keywords in PAIN_KEYWORDS.items():
            if category not in mapped_categories:
                continue
            if not any(keyword in text for keyword in keywords):
                continue

            if category == 'lead_gen':
                signal_key = 'missed_calls'
            elif category == 'reputation':
                # NOTE(review): 'recent_1star' is selected on rating alone;
                # the review date is not checked here, although the signal
                # description mentions "last 30 days" - confirm intent.
                signal_key = 'recent_1star' if rating <= 2 else 'low_rating'
            else:
                signal_key = 'broken_website'

            entry = detected.setdefault(signal_key, {
                'count': 0,
                'examples': [],
                'signal_info': PAIN_SIGNALS.get(signal_key, {}),
            })
            entry['count'] += 1

            # Store example (limit to 3)
            if len(entry['examples']) < 3:
                entry['examples'].append({
                    'text': text[:200],
                    'rating': rating,
                    'date': review_date,
                })
    return detected
def detect_structural_pain(lead):
    """
    Detect pain signals from lead structure (missing data).

    Args:
        lead: Business data dictionary. The keys read are 'website',
            'phone', 'hours', 'rating' and 'review_count'.

    Returns:
        Dictionary of detected structural pain signals, keyed by signal
        name, each with 'count' (always 1) and 'signal_info'.
    """
    def _as_number(value):
        # None or unparsable values become 0, which both range checks
        # below treat as "unknown" and therefore do not flag.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def _signal(signal_key):
        return {'count': 1, 'signal_info': PAIN_SIGNALS[signal_key]}

    detected = {}

    # Missing or empty fields map directly to a signal.
    missing_field_signals = (
        ('website', 'no_website'),
        ('phone', 'missing_phone'),
        ('hours', 'no_hours'),
    )
    for field, signal_key in missing_field_signals:
        if not lead.get(field):
            detected[signal_key] = _signal(signal_key)

    # Low rating (a rating of 0 means "no rating", not a bad one)
    rating = _as_number(lead.get('rating', 0))
    if 0 < rating < 3.5:
        detected['low_rating'] = _signal('low_rating')

    # Few reviews (0 is excluded: it is also the default for "unknown")
    review_count = _as_number(lead.get('review_count', 0))
    if 0 < review_count < 10:
        detected['few_reviews'] = _signal('few_reviews')

    return detected
def detect_website_pain(health_check):
    """
    Detect pain signals from website health check.

    Args:
        health_check: Dictionary from check_website_health(). The keys read
            are 'reachable', 'ssl_valid', 'load_time', 'mobile_friendly'
            and 'has_contact_form'.

    Returns:
        Dictionary of detected website pain signals
    """
    detected = {}
    if not health_check:
        return detected

    # Broken website (SSL issues, unreachable)
    if not health_check.get('reachable') or not health_check.get('ssl_valid'):
        detected['broken_website'] = {
            'count': 1,
            'signal_info': PAIN_SIGNALS['broken_website'],
            'details': health_check,
        }

    # Slow website. The load time may be None or non-numeric when it could
    # not be measured; treat that as "not slow" rather than raising.
    raw_load_time = health_check.get('load_time', 0)
    try:
        load_time = float(raw_load_time)
    except (TypeError, ValueError):
        load_time = 0.0
    if load_time > 3.0:
        detected['slow_website'] = {
            'count': 1,
            'signal_info': PAIN_SIGNALS['slow_website'],
            'details': {'load_time': raw_load_time},
        }

    # NOTE(review): a missing 'mobile_friendly' / 'has_contact_form' key is
    # read as falsy and therefore flagged; confirm the health check always
    # sets both keys, including for unreachable sites.

    # Not mobile friendly
    if not health_check.get('mobile_friendly'):
        detected['not_mobile_friendly'] = {
            'count': 1,
            'signal_info': PAIN_SIGNALS['not_mobile_friendly'],
        }

    # No contact form
    if not health_check.get('has_contact_form'):
        detected['no_contact_form'] = {
            'count': 1,
            'signal_info': PAIN_SIGNALS['no_contact_form'],
        }

    return detected
def detect_pain_signals(lead, reviews=None, health_check=None):
    """
    Detect all pain signals for a lead.

    Signals from the three sources (lead structure, review text, website
    health check) are merged by signal key. When two sources report the
    same signal their counts are added together, so evidence from one
    source never replaces evidence from another.

    Args:
        lead: Business data dictionary
        reviews: Optional list of reviews
        health_check: Optional website health check results; only used when
            the lead has a 'website' value

    Returns:
        Dictionary with all detected signals and metadata: 'signals',
        'pain_score', 'signal_count', 'primary_service' and 'confidence'.
    """
    logger = get_logger()
    all_signals = {}

    def _merge(new_signals):
        # Add counts for signals already present; carry over the optional
        # 'examples' / 'details' payloads supplied by the newer source.
        for key, value in new_signals.items():
            existing = all_signals.get(key)
            if existing is None:
                all_signals[key] = value
                continue
            existing['count'] = existing.get('count', 0) + value.get('count', 1)
            for extra in ('examples', 'details'):
                if extra in value:
                    existing[extra] = value[extra]

    # Structural pain (from lead data)
    _merge(detect_structural_pain(lead))

    # Review pain (from review text)
    if reviews:
        _merge(detect_review_pain(reviews))

    # Website pain (from health check). Merged rather than written over the
    # existing entries so that a 'broken_website' signal found in reviews
    # keeps its count and examples.
    if health_check and lead.get('website'):
        _merge(detect_website_pain(health_check))

    # Calculate total pain score
    pain_score = calculate_pain_score(all_signals)

    # Determine primary service to pitch (highest margin)
    primary_service = get_primary_service(all_signals)

    if pain_score >= 30:
        confidence = 'high'
    elif pain_score >= 15:
        confidence = 'medium'
    else:
        confidence = 'low'

    result = {
        'signals': all_signals,
        'pain_score': pain_score,
        'signal_count': len(all_signals),
        'primary_service': primary_service,
        'confidence': confidence,
    }

    if all_signals:
        logger.info(
            f"Pain detected for '{lead.get('name', 'Unknown')}': "
            f"score={pain_score}, signals={len(all_signals)}, "
            f"primary={primary_service}"
        )
    return result
def calculate_pain_score(signals):
    """
    Calculate total pain score from detected signals.

    Each signal contributes its weight scaled by how often it occurred:
    1x for a single occurrence, 1.5x for two or three, 2x for more.
    A signal whose count is zero or negative contributes nothing.
    The sum is truncated to an integer and is not capped.

    Args:
        signals: Dictionary of detected signals (None or empty gives 0)

    Returns:
        Integer pain score (higher = more pain)
    """
    if not signals:
        return 0

    total = 0
    for signal_key, signal_data in signals.items():
        if 'signal_info' in signal_data:
            signal_info = signal_data['signal_info']
        else:
            signal_info = PAIN_SIGNALS.get(signal_key, {})
        weight = signal_info.get('weight', 5)
        count = signal_data.get('count', 1)

        # A signal that never occurred must not outscore one that did.
        if count <= 0:
            continue

        # Diminishing returns: first occurrence counts most
        if count == 1:
            multiplier = 1
        elif count <= 3:
            multiplier = 1.5
        else:
            multiplier = 2
        total += weight * multiplier
    return int(total)
def get_primary_service(signals):
    """
    Pick the service to pitch first for a set of detected signals.

    Signals are ranked by margin tier (highest > high > medium > low) and,
    within a tier, by weight. On a full tie the signal that appears first
    in the dictionary wins.

    Args:
        signals: Dictionary of detected signals

    Returns:
        Name of the chosen service, or None when there are no signals
    """
    if not signals:
        return None

    tier_rank = {'highest': 4, 'high': 3, 'medium': 2, 'low': 1}

    def _info_for(key, data):
        if 'signal_info' in data:
            return data['signal_info']
        return PAIN_SIGNALS.get(key, {})

    def _ranking(info):
        tier = tier_rank.get(info.get('margin', 'low'), 1)
        return (tier, info.get('weight', 5))

    candidates = [_info_for(key, data) for key, data in signals.items()]
    # max() returns the first of several equally ranked candidates.
    winner = max(candidates, key=_ranking)
    return winner.get('service', 'General Digital Services')
def format_pain_summary(pain_data):
    """
    Format pain data as human-readable summary.

    The summary has a header (score, primary service, signal count)
    followed by one line per signal. When a signal carries examples, the
    first one is shown, cut to 100 characters; an ellipsis is added only
    when the text was actually cut.

    Args:
        pain_data: Dictionary from detect_pain_signals()

    Returns:
        Formatted string
    """
    if not pain_data['signals']:
        return "No pain signals detected"

    # NOTE(review): the "/100" suffix implies a capped score, but the score
    # calculation in this module does not cap at 100 - confirm the scale.
    lines = [
        f"Pain Score: {pain_data['pain_score']}/100 ({pain_data['confidence']} confidence)",
        f"Primary Service: {pain_data['primary_service'] or 'None'}",
        f"Signals Detected: {pain_data['signal_count']}",
        "",
        "Details:"
    ]

    max_example_chars = 100
    for signal_key, signal_data in pain_data['signals'].items():
        signal_info = signal_data.get('signal_info', {})
        description = signal_info.get('description', signal_key)
        count = signal_data.get('count', 1)
        lines.append(f" - {description} (x{count})")

        # Add example if available
        examples = signal_data.get('examples', [])
        if examples:
            full_text = examples[0].get('text') or ''
            snippet = full_text[:max_example_chars]
            suffix = '...' if len(full_text) > max_example_chars else ''
            lines.append(f" Example: \"{snippet}{suffix}\"")
    return '\n'.join(lines)